openvoice plugin
This view is limited to 50 files because it contains too many changes.
- .gitignore +13 -0
- dreamvoice/train_utils/prepare/get_dist.py +49 -0
- dreamvoice/train_utils/prepare/plugin_meta.csv +0 -0
- dreamvoice/train_utils/prepare/prepare_se.py +101 -0
- dreamvoice/train_utils/prepare/prompts.csv +0 -0
- dreamvoice/train_utils/prepare/val_meta.csv +121 -0
- dreamvoice/train_utils/src/configs/plugin.py +44 -0
- dreamvoice/train_utils/src/dataset/__init__.py +1 -0
- dreamvoice/train_utils/src/dataset/dreamvc.py +36 -0
- dreamvoice/train_utils/src/inference.py +114 -0
- dreamvoice/train_utils/src/model/p2e_cross.py +80 -0
- dreamvoice/train_utils/src/model/p2e_cross.yaml +26 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/LICENSE +24 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/README.md +64 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/__init__.py +1 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/audio.py +157 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/config.py +47 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/__init__.py +4 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py +39 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker.py +42 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py +14 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py +58 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/utterance.py +28 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/inference.py +211 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/model.py +137 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_data.py +30 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_model.py +12 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/preprocess.py +177 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/train.py +127 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/__init__.py +1 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/argutils.py +42 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/logmmse.py +222 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/profiler.py +47 -0
- dreamvoice/train_utils/src/modules/speaker_encoder/encoder/visualizations.py +180 -0
- dreamvoice/train_utils/src/openvoice/__init__.py +0 -0
- dreamvoice/train_utils/src/openvoice/api.py +202 -0
- dreamvoice/train_utils/src/openvoice/attentions.py +465 -0
- dreamvoice/train_utils/src/openvoice/commons.py +160 -0
- dreamvoice/train_utils/src/openvoice/mel_processing.py +183 -0
- dreamvoice/train_utils/src/openvoice/models.py +499 -0
- dreamvoice/train_utils/src/openvoice/modules.py +598 -0
- dreamvoice/train_utils/src/openvoice/openvoice_app.py +275 -0
- dreamvoice/train_utils/src/openvoice/se_extractor.py +153 -0
- dreamvoice/train_utils/src/openvoice/text/__init__.py +79 -0
- dreamvoice/train_utils/src/openvoice/text/cleaners.py +16 -0
- dreamvoice/train_utils/src/openvoice/text/english.py +188 -0
- dreamvoice/train_utils/src/openvoice/text/mandarin.py +326 -0
- dreamvoice/train_utils/src/openvoice/text/symbols.py +88 -0
- dreamvoice/train_utils/src/openvoice/transforms.py +209 -0
- dreamvoice/train_utils/src/openvoice/utils.py +194 -0
.gitignore
ADDED
@@ -0,0 +1,13 @@
+# Ignore Jupyter Notebook checkpoints
+.ipynb_checkpoints/
+
+# Ignore Python bytecode files
+*.pyc
+*.pyo
+*.pyd
+__pycache__/
+
+# Ignore virtual environments
+venv/
+env/
+.virtualenv/
dreamvoice/train_utils/prepare/get_dist.py
ADDED
@@ -0,0 +1,49 @@
+import os
+import torch
+import random
+import numpy as np
+
+
+# Function to recursively find all .pt files in a directory
+def find_pt_files(root_dir):
+    pt_files = []
+    for dirpath, _, filenames in os.walk(root_dir):
+        for file in filenames:
+            if file.endswith('.pt'):
+                pt_files.append(os.path.join(dirpath, file))
+    return pt_files
+
+
+# Function to compute statistics for a given tensor list
+def compute_statistics(tensor_list):
+    all_data = torch.cat(tensor_list)
+    mean = torch.mean(all_data).item()
+    std = torch.std(all_data).item()
+    max_val = torch.max(all_data).item()
+    min_val = torch.min(all_data).item()
+    return mean, std, max_val, min_val
+
+
+# Root directory containing .pt files in subfolders
+root_dir = "spk"
+
+# Find all .pt files
+pt_files = find_pt_files(root_dir)
+
+# Randomly sample 1000 .pt files (or fewer if less than 1000 files are available)
+sampled_files = random.sample(pt_files, min(1000, len(pt_files)))
+
+# Load tensors from sampled files
+tensor_list = []
+for file in sampled_files:
+    tensor = torch.load(file)
+    tensor_list.append(tensor.view(-1))  # Flatten the tensor
+
+# Compute statistics
+mean, std, max_val, min_val = compute_statistics(tensor_list)
+
+# Print the results
+print(f"Mean: {mean}")
+print(f"Std: {std}")
+print(f"Max: {max_val}")
+print(f"Min: {min_val}")
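These printed statistics feed the embedding normalization used elsewhere in the plugin (`minmax_norm_diff` / `reverse_minmax_norm_diff` in `src/utils`, imported by `src/inference.py`). A minimal sketch of that kind of min-max normalization, with placeholder bounds and a hypothetical `normalize_embedding` helper that is not part of this commit:

```python
import torch

# Placeholder bounds: replace with the Min/Max printed by get_dist.py
EMB_MIN, EMB_MAX = -0.25, 0.25

def normalize_embedding(emb: torch.Tensor) -> torch.Tensor:
    # min-max normalize to [0, 1], then shift to [-1, 1] for diffusion training
    scaled = (emb - EMB_MIN) / (EMB_MAX - EMB_MIN)
    return scaled * 2.0 - 1.0

def denormalize_embedding(emb: torch.Tensor) -> torch.Tensor:
    # inverse mapping, applied to embeddings generated by the diffusion model
    return (emb + 1.0) / 2.0 * (EMB_MAX - EMB_MIN) + EMB_MIN
```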
dreamvoice/train_utils/prepare/plugin_meta.csv
ADDED
The diff for this file is too large to render.
dreamvoice/train_utils/prepare/prepare_se.py
ADDED
@@ -0,0 +1,101 @@
+import os
+import torch
+import librosa
+from tqdm import tqdm
+from openvoice.api import ToneColorConverter
+from openvoice.mel_processing import spectrogram_torch
+from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
+
+
+@torch.no_grad()
+def se_extractor(audio_path, vc):
+    # vad
+    SAMPLE_RATE = 16000
+    audio_vad = get_audio_tensor(audio_path)
+    segments = get_vad_segments(
+        audio_vad,
+        output_sample=True,
+        min_speech_duration=0.1,
+        min_silence_duration=1,
+        method="silero",
+    )
+    segments = [(seg["start"], seg["end"]) for seg in segments]
+    segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s, e in segments]
+
+    if len(segments) == 0:
+        segments = [(0, len(audio_vad) / SAMPLE_RATE)]
+    print(segments)
+
+    # spk
+    hps = vc.hps
+    device = vc.device
+    model = vc.model
+    gs = []
+
+    audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
+    audio = torch.tensor(audio).float().to(device)
+
+    for s, e in segments:
+        y = audio[int(hps.data.sampling_rate * s):int(hps.data.sampling_rate * e)]
+        y = y.to(device)
+        y = y.unsqueeze(0)
+        y = spectrogram_torch(y, hps.data.filter_length,
+                              hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
+                              center=False).to(device)
+        g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
+        gs.append(g.detach())
+
+    gs = torch.stack(gs).mean(0)
+    return gs.cpu()
+
+
+def process_audio_folder(input_folder, output_folder, model, device):
+    """
+    Process all audio files in a folder and its subfolders,
+    save the extracted features as .pt files in the output folder with the same structure.
+
+    Args:
+        input_folder (str): Path to the input folder containing audio files.
+        output_folder (str): Path to the output folder to save .pt files.
+        model: Pre-trained model for feature extraction.
+        device: Torch device (e.g., 'cpu' or 'cuda').
+    """
+    # Collect all audio file paths
+    audio_files = []
+    for root, _, files in os.walk(input_folder):
+        for file in files:
+            if file.endswith(('.wav', '.mp3', '.flac')):  # Adjust for the audio formats you want to process
+                audio_files.append(os.path.join(root, file))
+
+    # Process each audio file with tqdm for progress
+    for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
+        # Construct output path
+        relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
+        output_dir = os.path.join(output_folder, relative_path)
+        os.makedirs(output_dir, exist_ok=True)
+        output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')
+
+        # Check if the .pt file already exists
+        if os.path.exists(output_path):
+            # print(f"Skipped (already exists): {output_path}")
+            continue  # Skip processing this file
+        # Extract features
+        target_se = se_extractor(audio_path, model).to(device)
+        # Save the feature as .pt
+        torch.save(target_se, output_path)
+        # print(f"Processed and saved: {output_path}")
+
+
+if __name__ == '__main__':
+    ckpt_converter = 'checkpoints_v2/converter'
+    device = "cuda:0" if torch.cuda.is_available() else "cpu"
+    model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
+    model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
+
+    input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
+    output_folder = 'spk/VCTK-Corpus/'
+    process_audio_folder(input_folder, output_folder, model, device)
+
+    input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
+    output_folder = 'spk/LibriTTS-R/train-clean-360/'
+    process_audio_folder(input_folder, output_folder, model, device)
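A minimal sketch of extracting one speaker embedding with the functions above, assuming the same OpenVoice `checkpoints_v2/converter` checkpoint layout used in `__main__` and a placeholder audio path:

```python
import torch
from openvoice.api import ToneColorConverter
from prepare_se import se_extractor  # the script above

device = "cuda:0" if torch.cuda.is_available() else "cpu"
converter = ToneColorConverter('checkpoints_v2/converter/config.json', device=device)
converter.load_ckpt('checkpoints_v2/converter/checkpoint.pth')

# Extract a tone-color embedding for a single utterance and cache it as .pt,
# mirroring what process_audio_folder does for whole datasets.
se = se_extractor('example.wav', converter)  # 'example.wav' is a placeholder
torch.save(se, 'example_spk.pt')
```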
dreamvoice/train_utils/prepare/prompts.csv
ADDED
The diff for this file is too large to render.
dreamvoice/train_utils/prepare/val_meta.csv
ADDED
@@ -0,0 +1,121 @@
+path,prompt
+LibriTTS-R/dev-clean/3081/166546/3081_166546_000101_000001.wav,"A gender-ambiguous teenager's voice, bright and smooth, perfect for client and public interaction."
+LibriTTS-R/dev-clean/3081/166546/3081_166546_000028_000002.wav,"A mature male voice, ideal for delivering creative narratives through oral storytelling."
+LibriTTS-R/dev-clean/3081/166546/3081_166546_000101_000001.wav,"An adult man's voice, charming and appealing, perfect for captivating storytelling."
+LibriTTS-R/dev-clean/84/121550/84_121550_000292_000000.wav,"A bright and engaging teenager's voice, suitable for client and public interaction."
+LibriTTS-R/dev-clean/84/121550/84_121550_000303_000000.wav,An elderly gentleman with a smooth and attractive voice.
+LibriTTS-R/dev-clean/84/121123/84_121123_000010_000000.wav,"A middle-aged woman with a bright, light voice."
+LibriTTS-R/dev-clean/5895/34615/5895_34615_000029_000000.wav,A young and gender-neutral teenager's voice.
+LibriTTS-R/dev-clean/5895/34615/5895_34615_000009_000000.wav,A warm and attractive adult male voice.
+LibriTTS-R/dev-clean/5895/34622/5895_34622_000018_000001.wav,"A middle-aged male voice, rough and husky."
+LibriTTS-R/dev-clean/2035/147960/2035_147960_000022_000005.wav,"An elderly male voice with a dark, rough, authoritative, and strong tone."
+LibriTTS-R/dev-clean/2035/152373/2035_152373_000010_000002.wav,"An elderly male voice, exuding warmth and kindness."
+LibriTTS-R/dev-clean/2035/147960/2035_147960_000003_000002.wav,"A mature woman's voice, bright and smooth, ideal for client and public interaction."
+LibriTTS-R/dev-clean/1673/143396/1673_143396_000017_000002.wav,"A deep, rich, and dark voice of an elderly man."
+LibriTTS-R/dev-clean/1673/143397/1673_143397_000016_000002.wav,A middle-aged man with a masculine voice.
+LibriTTS-R/dev-clean/1673/143397/1673_143397_000031_000006.wav,"A teenage girl's voice, bright and cute, yet with a hint of weakness, perfect for client and public interaction."
+LibriTTS-R/dev-clean/2803/154328/2803_154328_000019_000000.wav,"An older man's voice, deep and rich, exuding charm and allure."
+LibriTTS-R/dev-clean/2803/154320/2803_154320_000007_000000.wav,"An adult female voice that is smooth and captivating, perfect for storytelling."
+LibriTTS-R/dev-clean/2803/154328/2803_154328_000069_000000.wav,An adult woman's weak voice for client and public interaction.
+LibriTTS-R/dev-clean/3752/4944/3752_4944_000097_000000.wav,"An elderly male voice with a deep, silky, and charming tone."
+LibriTTS-R/dev-clean/3752/4944/3752_4944_000009_000000.wav,"A teenage girl's voice, suited for client and public interaction."
+LibriTTS-R/dev-clean/3752/4943/3752_4943_000061_000000.wav,"A mature male voice, authoritative and commanding, ideal for political negotiation and legal discourse."
+LibriTTS-R/dev-clean/1919/142785/1919_142785_000038_000001.wav,"An adult male voice, deep and rich, suited for public speaking engagements."
+LibriTTS-R/dev-clean/1919/142785/1919_142785_000012_000001.wav,"A mature, gender-ambiguous adult voice that is silky and mellow."
+LibriTTS-R/dev-clean/1919/142785/1919_142785_000035_000003.wav,"A teenage girl's voice, bright and captivating for storytelling."
+LibriTTS-R/dev-clean/6313/66129/6313_66129_000074_000003.wav,An androgynous elderly voice.
+LibriTTS-R/dev-clean/6313/76958/6313_76958_000008_000000.wav,"An elderly male voice, deep and rich, with a warm and inviting quality, perfect for storytelling with a heartfelt touch."
+LibriTTS-R/dev-clean/6313/66129/6313_66129_000031_000000.wav,"An adult male voice, dark and captivating, perfect for storytelling."
+LibriTTS-R/dev-clean/652/130737/652_130737_000031_000000.wav,"An adult male voice, dark and attractive, exuding warmth and charm."
+LibriTTS-R/dev-clean/652/130737/652_130737_000031_000000.wav,"A teenage boy with a confident voice, ideal for client and public interaction."
+LibriTTS-R/dev-clean/652/129742/652_129742_000010_000003.wav,"A mature man's voice, deep and resonant with a touch of twanginess, perfect for customer service and public engagement roles."
+LibriTTS-R/dev-clean/2902/9008/2902_9008_000026_000003.wav,"An authoritative senior male voice, dark and strong yet warm and comforting."
+LibriTTS-R/dev-clean/2902/9006/2902_9006_000011_000000.wav,"An elderly male voice with a dark, rough, authoritative, and strong tone."
+LibriTTS-R/dev-clean/2902/9008/2902_9008_000008_000002.wav,An adult male voice that is bright and authoritative.
+LibriTTS-R/dev-clean/7976/105575/7976_105575_000006_000001.wav,"A young male voice, charming and sweet."
+LibriTTS-R/dev-clean/7976/105575/7976_105575_000015_000000.wav,"An elderly man's voice, rough and hoarse."
+LibriTTS-R/dev-clean/7976/110523/7976_110523_000032_000002.wav,"A mature adult female voice, commanding and assertive."
+LibriTTS-R/dev-clean/7850/111771/7850_111771_000006_000000.wav,"An elderly male voice with a deep, rich tone that is inviting and heartfelt."
+LibriTTS-R/dev-clean/7850/281318/7850_281318_000006_000000.wav,A gender-ambiguous teenager's voice.
+LibriTTS-R/dev-clean/7850/281318/7850_281318_000001_000003.wav,"A middle-aged male voice, feeble and faint."
+LibriTTS-R/dev-clean/2086/149220/2086_149220_000045_000002.wav,"An adult male voice, smooth and velvety."
+LibriTTS-R/dev-clean/2086/149220/2086_149220_000006_000012.wav,"An adult woman's voice, bright and smooth, ideal for client and public interaction."
+LibriTTS-R/dev-clean/2086/149214/2086_149214_000004_000003.wav,A senior male voice that is strong and authoritative.
+LibriTTS-R/dev-clean/2412/153947/2412_153947_000017_000005.wav,A mature female voice suited for customer service and public engagement.
+LibriTTS-R/dev-clean/2412/153947/2412_153947_000017_000005.wav,"An adult female voice, bright and warm, perfect for client and public interaction."
+LibriTTS-R/dev-clean/2412/153954/2412_153954_000006_000003.wav,"A senior male voice with a dark, rough texture."
+LibriTTS-R/dev-clean/1988/148538/1988_148538_000011_000000.wav,"An adult male voice, dark, authoritative, and strong, perfect for storytelling."
+LibriTTS-R/dev-clean/1988/147956/1988_147956_000009_000008.wav,"A senior female voice that is smooth, warm, and attractive."
+LibriTTS-R/dev-clean/1988/24833/1988_24833_000009_000003.wav,A youthful voice with an androgynous and gender-neutral quality.
+LibriTTS-R/dev-clean/6319/275224/6319_275224_000022_000008.wav,"A female adult voice, perfect for captivating storytelling."
+LibriTTS-R/dev-clean/6319/275224/6319_275224_000024_000001.wav,A commanding and powerful adult female voice.
+LibriTTS-R/dev-clean/6319/275224/6319_275224_000022_000009.wav,"A senior male voice with a dark, rough texture."
+LibriTTS-R/dev-clean/2428/83705/2428_83705_000025_000000.wav,A man's voice in adulthood.
+LibriTTS-R/dev-clean/2428/83699/2428_83699_000033_000003.wav,"An adult male voice that is warm and attractive, ideal for engaging storytelling."
+LibriTTS-R/dev-clean/2428/83705/2428_83705_000023_000002.wav,"A young and gender-ambiguous teenager with a rough, hoarse voice."
+LibriTTS-R/dev-clean/5536/43359/5536_43359_000003_000002.wav,"A mature male voice with a deep, hoarse quality."
+LibriTTS-R/dev-clean/5536/43359/5536_43359_000023_000000.wav,"A senior man's voice, dark, rough, strong, and authoritative, perfect for storytelling."
+LibriTTS-R/dev-clean/5536/43359/5536_43359_000010_000002.wav,An adult male with a strong voice.
+LibriTTS-R/dev-clean/422/122949/422_122949_000001_000000.wav,"An adult man's bright voice, perfect for storytelling."
+LibriTTS-R/dev-clean/422/122949/422_122949_000013_000010.wav,"An adult woman's voice, smooth and captivating, perfect for storytelling and creative narration."
+LibriTTS-R/dev-clean/422/122949/422_122949_000001_000000.wav,An older male's voice.
+LibriTTS-R/dev-clean/251/137823/251_137823_000056_000002.wav,"A senior male voice, dark, rough, authoritative, and attractive."
+LibriTTS-R/dev-clean/251/137823/251_137823_000030_000000.wav,"An elderly woman with a bright, nasal voice."
+LibriTTS-R/dev-clean/251/136532/251_136532_000002_000004.wav,"An adult male voice, dark, attractive, and authoritative, perfect for public presentations."
+LibriTTS-R/dev-clean/3170/137482/3170_137482_000027_000005.wav,"A gender-ambiguous teenager with a cute, sweet voice."
+LibriTTS-R/dev-clean/3170/137482/3170_137482_000003_000005.wav,"An adult female voice, authoritative and commanding, suited for roles in diplomacy and judiciary."
+LibriTTS-R/dev-clean/3170/137482/3170_137482_000007_000000.wav,"A senior male voice, authoritative and commanding, perfect for public presentations."
+LibriTTS-R/dev-clean/174/84280/174_84280_000016_000000.wav,"A senior man's voice, dark, rough, and authoritative for public presentations."
+LibriTTS-R/dev-clean/174/50561/174_50561_000022_000000.wav,"An adult male voice that is dark, attractive, and warm."
+LibriTTS-R/dev-clean/174/168635/174_168635_000025_000000.wav,"An adult male voice, bright and engaging, perfect for public presentations."
+LibriTTS-R/dev-clean/3853/163249/3853_163249_000134_000000.wav,A bright and smooth teenage girl with an attractive voice.
+LibriTTS-R/dev-clean/3853/163249/3853_163249_000088_000000.wav,"A middle-aged man with a deep, hoarse, and powerful voice."
+LibriTTS-R/dev-clean/3853/163249/3853_163249_000077_000000.wav,"An adult voice with a gender-ambiguous tone, suitable for client and public interaction."
+LibriTTS-R/dev-clean/1272/141231/1272_141231_000013_000001.wav,An older male voice with a dark and attractive tone.
+LibriTTS-R/dev-clean/1272/141231/1272_141231_000027_000005.wav,A teenage boy with a feeble voice.
+LibriTTS-R/dev-clean/1272/141231/1272_141231_000034_000003.wav,"A mature adult male voice, with a deep, attractive and alluring tone."
+LibriTTS-R/dev-clean/6295/244435/6295_244435_000014_000000.wav,"A mature man's voice, deep and powerful, with an alluring quality, perfect for storytelling."
+LibriTTS-R/dev-clean/6295/64301/6295_64301_000009_000003.wav,"A mature male voice with a deep and rich tone, ideal for captivating storytelling."
+LibriTTS-R/dev-clean/6295/64301/6295_64301_000017_000000.wav,A mature female voice with a hoarse and husky quality.
+LibriTTS-R/dev-clean/8297/275154/8297_275154_000011_000001.wav,"An elderly voice with a smooth, silky texture."
+LibriTTS-R/dev-clean/8297/275154/8297_275154_000022_000011.wav,"An adult woman's voice that is bright, smooth, attractive, and warm, perfect for engaging storytelling."
+LibriTTS-R/dev-clean/8297/275154/8297_275154_000024_000007.wav,"An adult male voice, dark and rough, exuding authority and perfect for public presentations."
+LibriTTS-R/dev-clean/1462/170138/1462_170138_000019_000002.wav,"A confident and commanding adult female voice, with a smooth and authoritative tone."
+LibriTTS-R/dev-clean/1462/170138/1462_170138_000003_000004.wav,"An adult male voice, deep and commanding with a sense of authority."
+LibriTTS-R/dev-clean/1462/170142/1462_170142_000041_000001.wav,"A mature female voice, deep and authoritative."
+LibriTTS-R/dev-clean/2277/149897/2277_149897_000023_000000.wav,"An adult male voice with a smooth, warm tone and a subtle nasal quality."
+LibriTTS-R/dev-clean/2277/149896/2277_149896_000013_000000.wav,"An adult man's voice, dark and strong, with an attractive allure, ideal for storytelling."
+LibriTTS-R/dev-clean/2277/149896/2277_149896_000025_000003.wav,"An adult man's voice, weak yet engaging for client and public interaction."
+LibriTTS-R/dev-clean/8842/302201/8842_302201_000008_000005.wav,"A teenage boy's voice, nasal and weak, suitable for client and public interaction."
+LibriTTS-R/dev-clean/8842/304647/8842_304647_000017_000001.wav,"An adult male voice, dark and rough, with an attractive charm suited for diplomacy and judiciary work."
+LibriTTS-R/dev-clean/8842/302203/8842_302203_000020_000002.wav,"A mature woman's voice, deep and rich, ideal for political negotiation and legal discourse."
+LibriTTS-R/dev-clean/5338/284437/5338_284437_000054_000002.wav,"A senior male voice, dark and smooth, with an attractive and alluring quality, perfect for storytelling narratives."
+LibriTTS-R/dev-clean/5338/284437/5338_284437_000034_000000.wav,"An elderly female voice, rough and rugged."
+LibriTTS-R/dev-clean/5338/284437/5338_284437_000046_000002.wav,"An older male voice, deep and powerful."
+LibriTTS-R/dev-clean/3576/138058/3576_138058_000051_000000.wav,"An adult female voice, authoritative and commanding, suited for political negotiation and legal discourse."
+LibriTTS-R/dev-clean/3576/138058/3576_138058_000024_000001.wav,"A young male voice, feeble and faint."
+LibriTTS-R/dev-clean/3576/138058/3576_138058_000042_000000.wav,"An elderly male voice, dark, strong, and authoritative in tone."
+LibriTTS-R/dev-clean/6345/93302/6345_93302_000063_000001.wav,"An adult male voice, dark and warm in tone."
+LibriTTS-R/dev-clean/6345/93302/6345_93302_000069_000000.wav,"An adult male voice, dark and rough yet warm and inviting, perfect for captivating storytelling."
+LibriTTS-R/dev-clean/6345/64257/6345_64257_000007_000003.wav,"An adult man's voice, dark, warm, and attractive, perfect for engaging storytelling."
+LibriTTS-R/dev-clean/3000/15664/3000_15664_000006_000002.wav,"A senior male voice, dark and smooth, with an attractive tone, perfect for captivating storytelling."
+LibriTTS-R/dev-clean/3000/15664/3000_15664_000040_000001.wav,"A mature male voice with a deep, husky and rough texture."
+LibriTTS-R/dev-clean/3000/15664/3000_15664_000025_000000.wav,A mature and androgynous voice.
+LibriTTS-R/dev-clean/1993/147966/1993_147966_000011_000003.wav,"An adult woman's voice, dark, smooth, and attractive, perfect for storytelling."
+LibriTTS-R/dev-clean/1993/147965/1993_147965_000007_000001.wav,"An adult woman's voice, warm and inviting, perfect for creative storytelling."
+LibriTTS-R/dev-clean/1993/147965/1993_147965_000002_000003.wav,"A teenager with a bright and lively voice, gender-ambiguous."
+LibriTTS-R/dev-clean/3536/8226/3536_8226_000026_000012.wav,A mature female voice ideal for client and public interaction.
+LibriTTS-R/dev-clean/3536/23268/3536_23268_000028_000000.wav,"An adult voice with a gender-ambiguous tone, bright and smooth."
+LibriTTS-R/dev-clean/3536/8226/3536_8226_000026_000009.wav,"A mature male voice, ideal for delivering creative narratives through oral storytelling."
+LibriTTS-R/dev-clean/5694/64029/5694_64029_000028_000001.wav,"A middle-aged man's attractive voice, perfect for captivating storytelling."
+LibriTTS-R/dev-clean/5694/64038/5694_64038_000005_000000.wav,An elderly male voice that is authoritative and strong.
+LibriTTS-R/dev-clean/5694/64025/5694_64025_000003_000000.wav,"An elderly male voice, authoritative and commanding in tone."
+LibriTTS-R/dev-clean/6241/61943/6241_61943_000020_000000.wav,"An adult man's voice, dark and authoritative, ideal for diplomacy and judiciary roles."
+LibriTTS-R/dev-clean/6241/61946/6241_61946_000043_000000.wav,A young boy's voice with a hoarse and husky tone.
+LibriTTS-R/dev-clean/6241/61943/6241_61943_000039_000004.wav,"A mature, gender-neutral adult voice, smooth and perfect for storytelling."
+LibriTTS-R/dev-clean/2078/142845/2078_142845_000018_000000.wav,"A teenage girl's sweet and charming voice, perfect for customer service and public engagement."
+LibriTTS-R/dev-clean/2078/142845/2078_142845_000049_000000.wav,"An adult male voice that is dark, rough, attractive, and authoritative."
+LibriTTS-R/dev-clean/2078/142845/2078_142845_000052_000000.wav,"An adult female voice, smooth and silky, perfect for customer service and public engagement roles."
+LibriTTS-R/dev-clean/777/126732/777_126732_000076_000007.wav,"A senior male voice, characterized by a dark and weak tone."
+LibriTTS-R/dev-clean/777/126732/777_126732_000076_000006.wav,"A senior man with a dark, authoritative, and strong voice, suited for diplomacy and judiciary professions."
+LibriTTS-R/dev-clean/777/126732/777_126732_000076_000007.wav,"A mature male voice with a nasal, twangy quality."
dreamvoice/train_utils/src/configs/plugin.py
ADDED
@@ -0,0 +1,44 @@
+import numpy as np
+
+
+class AttrDict(dict):
+    def __init__(self, *args, **kwargs):
+        super(AttrDict, self).__init__(*args, **kwargs)
+        self.__dict__ = self
+
+    def override(self, attrs):
+        if isinstance(attrs, dict):
+            self.__dict__.update(**attrs)
+        elif isinstance(attrs, (list, tuple, set)):
+            for attr in attrs:
+                self.override(attr)
+        elif attrs is not None:
+            raise NotImplementedError
+        return self
+
+
+all_params = {
+    'Plugin_base': AttrDict(
+        # Diff params
+        diff=AttrDict(
+            num_train_steps=1000,
+            beta_start=1e-4,
+            beta_end=0.02,
+            num_infer_steps=50,
+            v_prediction=True,
+        ),
+
+        text_encoder=AttrDict(
+            model='google/flan-t5-base'
+        ),
+        opt=AttrDict(
+            learning_rate=1e-4,
+            beta1=0.9,
+            beta2=0.999,
+            weight_decay=1e-4,
+            adam_epsilon=1e-08,
+        ),),
+}
+
+def get_params(name):
+    return all_params[name]
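A minimal sketch of consuming this config, assuming the script runs from `src/` and that a diffusers `DDIMScheduler` is built from the `diff` parameters (the actual training script is not part of this section):

```python
from diffusers import DDIMScheduler
from configs.plugin import get_params

params = get_params('Plugin_base')
params.opt.override({'learning_rate': 5e-5})  # AttrDict.override updates fields in place

# Map the plugin's diffusion hyperparameters onto a diffusers scheduler
scheduler = DDIMScheduler(
    num_train_timesteps=params.diff.num_train_steps,
    beta_start=params.diff.beta_start,
    beta_end=params.diff.beta_end,
    prediction_type='v_prediction' if params.diff.v_prediction else 'epsilon',
)
scheduler.set_timesteps(params.diff.num_infer_steps)
```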
dreamvoice/train_utils/src/dataset/__init__.py
ADDED
@@ -0,0 +1 @@
+from .vcdata import VCData
dreamvoice/train_utils/src/dataset/dreamvc.py
ADDED
@@ -0,0 +1,36 @@
+import pandas as pd
+import os
+import random
+import ast
+import numpy as np
+import torch
+from einops import repeat, rearrange
+import librosa
+
+from torch.utils.data import Dataset
+import torchaudio
+
+
+class DreamData(Dataset):
+    def __init__(self, data_dir, meta_dir, subset, prompt_dir,):
+        self.datadir = data_dir
+        meta = pd.read_csv(meta_dir)
+        self.meta = meta[meta['subset'] == subset]
+        self.subset = subset
+        self.prompts = pd.read_csv(prompt_dir)
+
+    def __getitem__(self, index):
+        row = self.meta.iloc[index]
+
+        # get spk
+        spk_path = self.datadir + row['spk_path']
+        spk = torch.load(spk_path, map_location='cpu').squeeze(0)
+
+        speaker = row['speaker']
+
+        # get prompt
+        prompt = self.prompts[self.prompts['speaker_id'] == str(speaker)].sample(1)['prompt'].iloc[0]
+        return spk, prompt
+
+    def __len__(self):
+        return len(self.meta)
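A minimal sketch of wrapping `DreamData` in a standard `DataLoader`; the CSV paths are placeholders and the metadata is assumed to contain `subset`, `spk_path`, and `speaker` columns:

```python
from torch.utils.data import DataLoader
from dataset.dreamvc import DreamData

train_set = DreamData(data_dir='spk/',
                      meta_dir='prepare/plugin_meta.csv',
                      subset='train',
                      prompt_dir='prepare/prompts.csv')
train_loader = DataLoader(train_set, batch_size=32, shuffle=True, num_workers=4)

for spk, prompt in train_loader:
    # spk: batch of cached speaker-embedding tensors; prompt: tuple of prompt strings
    break
```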
dreamvoice/train_utils/src/inference.py
ADDED
@@ -0,0 +1,114 @@
+import os
+import torch
+import soundfile as sf
+import pandas as pd
+from tqdm import tqdm
+from utils import minmax_norm_diff, reverse_minmax_norm_diff
+from spk_ext import se_extractor
+
+
+def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
+    """
+    Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
+    Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
+    """
+    std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
+    std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
+    # rescale the results from guidance (fixes overexposure)
+    noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
+    # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
+    noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
+    return noise_cfg
+
+
+@torch.no_grad()
+def inference_timbre(gen_shape, text,
+                     model, scheduler,
+                     guidance_scale=5, guidance_rescale=0.7,
+                     ddim_steps=50, eta=1, random_seed=2023,
+                     device='cuda',
+                     ):
+    text, text_mask = text
+    model.eval()
+    generator = torch.Generator(device=device).manual_seed(random_seed)
+    scheduler.set_timesteps(ddim_steps)
+
+    # init noise
+    noise = torch.randn(gen_shape, generator=generator, device=device)
+    latents = noise
+
+    for t in scheduler.timesteps:
+        latents = scheduler.scale_model_input(latents, t)
+
+        if guidance_scale:
+            output_text = model(latents, t, text, text_mask, train_cfg=False)
+            output_uncond = model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
+
+            output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
+            if guidance_rescale > 0.0:
+                output_pred = rescale_noise_cfg(output_pred, output_text,
+                                                guidance_rescale=guidance_rescale)
+        else:
+            output_pred = model(latents, t, text, text_mask, train_cfg=False)
+
+        latents = scheduler.step(model_output=output_pred, timestep=t, sample=latents,
+                                 eta=eta, generator=generator).prev_sample
+
+    # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
+    # pred = torch.clip(pred, min=0.0, max=0.5)
+    return latents
+
+
+@torch.no_grad()
+def eval_plugin_light(vc_model, text_model,
+                      timbre_model, timbre_scheduler, timbre_shape,
+                      val_meta, val_folder,
+                      guidance_scale=3, guidance_rescale=0.7,
+                      ddim_steps=50, eta=1, random_seed=2024,
+                      device='cuda',
+                      epoch=0, save_path='logs/eval/', val_num=10, sr=24000):
+
+    tokenizer, text_encoder = text_model
+
+    df = pd.read_csv(val_meta)
+
+    save_path = save_path + str(epoch) + '/'
+    os.makedirs(save_path, exist_ok=True)
+
+    step = 0
+
+    for i in range(len(df)):
+        row = df.iloc[i]
+
+        source_path = val_folder + row['path']
+        prompt = [row['prompt']]
+
+        with torch.no_grad():
+            text_batch = tokenizer(prompt,
+                                   max_length=32,
+                                   padding='max_length', truncation=True, return_tensors="pt")
+            text, text_mask = text_batch.input_ids.to(device), \
+                text_batch.attention_mask.to(device)
+            text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
+
+            spk_embed = inference_timbre(timbre_shape, [text, text_mask],
+                                         timbre_model, timbre_scheduler,
+                                         guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
+                                         ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
+                                         device=device)
+
+            source_se = se_extractor(source_path, vc_model).to(device)
+            # print(source_se.shape)
+            # print(spk_embed.shape)
+
+            encode_message = "@MyShell"
+            vc_model.convert(
+                audio_src_path=source_path,
+                src_se=source_se,
+                tgt_se=spk_embed,
+                output_path=save_path + f'{step}_{prompt[0]}' + '.wav',
+                message=encode_message)
+
+        step += 1
+        if step >= val_num:
+            break
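For orientation, a sketch of how the arguments to `eval_plugin_light` could be assembled from the other files in this commit, assuming it runs from `src/`; the plugin checkpoint name, dataset root, and generation shape are assumptions, not values fixed by the commit:

```python
import torch
import yaml
from diffusers import DDIMScheduler
from transformers import AutoTokenizer, T5EncoderModel
from openvoice.api import ToneColorConverter
from model.p2e_cross import P2E_Cross
from inference import eval_plugin_light

device = 'cuda'

# Text encoder (prompt -> hidden states), matching configs/plugin.py
tokenizer = AutoTokenizer.from_pretrained('google/flan-t5-base')
text_encoder = T5EncoderModel.from_pretrained('google/flan-t5-base').to(device).eval()

# Prompt-to-embedding diffusion model
with open('model/p2e_cross.yaml', 'r') as fp:
    config = yaml.safe_load(fp)
timbre_model = P2E_Cross(config['diffwrap']).to(device)
timbre_model.load_state_dict(torch.load('plugin.pt', map_location=device))  # assumed checkpoint name

scheduler = DDIMScheduler(num_train_timesteps=1000, beta_start=1e-4, beta_end=0.02,
                          prediction_type='v_prediction')

# OpenVoice tone-color converter used as the voice-conversion backend
vc_model = ToneColorConverter('checkpoints_v2/converter/config.json', device=device)
vc_model.load_ckpt('checkpoints_v2/converter/checkpoint.pth')

eval_plugin_light(vc_model, (tokenizer, text_encoder),
                  timbre_model, scheduler, timbre_shape=(1, 256, 1),
                  val_meta='../prepare/val_meta.csv',
                  val_folder='/path/to/datasets/')  # parent folder of LibriTTS-R/
```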
dreamvoice/train_utils/src/model/p2e_cross.py
ADDED
@@ -0,0 +1,80 @@
+import torch
+import torch.nn as nn
+from diffusers import UNet2DModel, UNet2DConditionModel
+import yaml
+from einops import repeat, rearrange
+
+from typing import Any
+from torch import Tensor
+
+
+def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
+    if proba == 1:
+        return torch.ones(shape, device=device, dtype=torch.bool)
+    elif proba == 0:
+        return torch.zeros(shape, device=device, dtype=torch.bool)
+    else:
+        return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
+
+
+class FixedEmbedding(nn.Module):
+    def __init__(self, features=128):
+        super().__init__()
+        self.embedding = nn.Embedding(1, features)
+
+    def forward(self, y):
+        B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
+        embed = self.embedding(torch.zeros(B, device=device).long())
+        fixed_embedding = repeat(embed, "b c -> b l c", l=L)
+        return fixed_embedding
+
+
+class P2E_Cross(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.unet = UNet2DConditionModel(**self.config['unet'])
+        self.unet.set_use_memory_efficient_attention_xformers(True)
+        self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
+
+        self.context_embedding = nn.Sequential(
+            nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
+            nn.SiLU(),
+            nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
+
+    def forward(self, target, t, prompt, prompt_mask=None,
+                train_cfg=False, cfg_prob=0.0):
+        target = target.unsqueeze(-1)
+        B, C, _, _ = target.shape
+
+        if train_cfg:
+            if cfg_prob > 0.0:
+                # Randomly mask embedding
+                batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device)
+                fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
+                prompt = torch.where(batch_mask, fixed_embedding, prompt)
+
+        prompt = self.context_embedding(prompt)
+        # fix the bug that prompt will copy dtype from target in diffusers
+        target = target.to(prompt.dtype)
+
+        output = self.unet(sample=target, timestep=t,
+                           encoder_hidden_states=prompt,
+                           encoder_attention_mask=prompt_mask)['sample']
+
+        return output.squeeze(-1)
+
+
+if __name__ == "__main__":
+    with open('p2e_cross.yaml', 'r') as fp:
+        config = yaml.safe_load(fp)
+    device = 'cuda'
+
+    model = P2E_Cross(config['diffwrap']).to(device)
+
+    x = torch.rand((2, 256)).to(device)
+    t = torch.randint(0, 1000, (2,)).long().to(device)
+    prompt = torch.rand(2, 64, 768).to(device)
+    prompt_mask = torch.ones(2, 64).to(device)
+
+    output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25)
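A minimal sketch of a v-prediction training step for `P2E_Cross` with classifier-free-guidance dropout, using the diffusion hyperparameters from `configs/plugin.py`; this is an illustration of the setup, not the commit's training loop:

```python
import torch
import torch.nn.functional as F
from diffusers import DDPMScheduler

scheduler = DDPMScheduler(num_train_timesteps=1000, beta_start=1e-4, beta_end=0.02,
                          prediction_type='v_prediction')

def training_step(model, spk, text, text_mask, device='cuda'):
    # spk: (B, 256, 1) normalized speaker embeddings; text: (B, L, 768) T5 hidden states
    spk, text, text_mask = spk.to(device), text.to(device), text_mask.to(device)
    noise = torch.randn_like(spk)
    t = torch.randint(0, scheduler.config.num_train_timesteps,
                      (spk.shape[0],), device=device).long()

    noisy = scheduler.add_noise(spk, noise, t)
    target = scheduler.get_velocity(spk, noise, t)  # v-prediction target

    # A fraction of prompts is swapped for the fixed "unconditional" embedding inside the model
    pred = model(noisy, t, text, text_mask, train_cfg=True, cfg_prob=0.25)
    return F.mse_loss(pred, target)
```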
dreamvoice/train_utils/src/model/p2e_cross.yaml
ADDED
@@ -0,0 +1,26 @@
+version: 1.0
+
+system: "cross"
+
+diffwrap:
+  cls_embedding:
+    content_dim: 768
+    content_hidden: 256
+
+  unet:
+    sample_size: [1, 1]
+    in_channels: 256
+    out_channels: 256
+    layers_per_block: 2
+    block_out_channels: [256]
+    down_block_types:
+      [
+        "CrossAttnDownBlock2D",
+      ]
+    up_block_types:
+      [
+        "CrossAttnUpBlock2D",
+      ]
+    attention_head_dim: 32
+    cross_attention_dim: 768
+
dreamvoice/train_utils/src/modules/speaker_encoder/LICENSE
ADDED
@@ -0,0 +1,24 @@
+MIT License
+
+Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
+Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
+Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
+Original work Copyright (c) 2015 braindead (https://github.com/braindead)
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
dreamvoice/train_utils/src/modules/speaker_encoder/README.md
ADDED
@@ -0,0 +1,64 @@
+# Real-Time Voice Cloning
+This repository is an implementation of [Transfer Learning from Speaker Verification to
+Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
+
+SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text.
+
+**Video demonstration** (click the picture):
+
+[![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
+
+
+
+### Papers implemented
+| URL | Designation | Title | Implementation source |
+| --- | ----------- | ----- | --------------------- |
+|[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
+|[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
+|[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
+
+## News
+**10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion.
+
+**28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below.
+
+**14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish.
+
+**13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this:
+- **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors.
+- **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info.
+
+**20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it.
+
+
+## Setup
+
+### 1. Install Requirements
+1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
+2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
+3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
+4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
+5. Install the remaining requirements with `pip install -r requirements.txt`
+
+### 2. (Optional) Download Pretrained Models
+Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
+
+### 3. (Optional) Test Configuration
+Before you download any dataset, you can begin by testing your configuration with:
+
+`python demo_cli.py`
+
+If all tests pass, you're good to go.
+
+### 4. (Optional) Download Datasets
+For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
+
+### 5. Launch the Toolbox
+You can then try the toolbox:
+
+`python demo_toolbox.py -d <datasets_root>`
+or
+`python demo_toolbox.py`
+
+depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/__init__.py
ADDED
@@ -0,0 +1 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/audio.py
ADDED
@@ -0,0 +1,157 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+from scipy.ndimage.morphology import binary_dilation
+from .params_data import *
+from pathlib import Path
+from typing import Optional, Union
+import numpy as np
+import webrtcvad
+import librosa
+import struct
+
+import torch
+from torchaudio.transforms import Resample
+from librosa.filters import mel as librosa_mel_fn
+
+
+int16_max = (2 ** 15) - 1
+
+
+def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
+                   source_sr: Optional[int] = None):
+    """
+    Applies the preprocessing operations used in training the Speaker Encoder to a waveform
+    either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
+
+    :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
+    just .wav), either the waveform as a numpy array of floats.
+    :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
+    preprocessing. After preprocessing, the waveform's sampling rate will match the data
+    hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
+    this argument will be ignored.
+    """
+    # Load the wav from disk if needed
+    if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
+        wav, source_sr = librosa.load(fpath_or_wav, sr=None)
+    else:
+        wav = fpath_or_wav
+
+    # Resample the wav if needed
+    if source_sr is not None and source_sr != sampling_rate:
+        wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
+
+    # Apply the preprocessing: normalize volume and shorten long silences
+    wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
+    wav = trim_long_silences(wav)
+
+    return wav
+
+
+def preprocess_wav_batch(wavs, source_sr=22050):
+    # This torch version is designed to cope with a batch of same lengths wavs
+    if sampling_rate != source_sr:
+        resample = Resample(source_sr, sampling_rate)
+        wavs = resample(wavs)
+    wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS,
+                                               increase_only=True)
+    # Trimming silence is not implemented in this version yet!
+    return wavs_preprocessed
+
+
+def wav_to_mel_spectrogram(wav):
+    """
+    Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
+    Note: this not a log-mel spectrogram.
+    """
+    frames = librosa.feature.melspectrogram(
+        y=wav,
+        sr=sampling_rate,
+        n_fft=int(sampling_rate * mel_window_length / 1000),
+        hop_length=int(sampling_rate * mel_window_step / 1000),
+        n_mels=mel_n_channels
+    )
+    return frames.astype(np.float32).T
+
+
+def wav_to_mel_spectrogram_batch(wavs):
+    # This torch version is designed to cope with a batch of same lengths wavs
+    n_fft = int(sampling_rate * mel_window_length / 1000)
+    hop_length = int(sampling_rate * mel_window_step / 1000)
+    win_length = int(sampling_rate * mel_window_length / 1000)
+    window = torch.hann_window(n_fft).to(wavs)
+    mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft,
+                                                n_mels=mel_n_channels)).to(wavs)
+    s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length,
+                   win_length=win_length, window=window, center=True, return_complex=False)
+    real_part, imag_part = s.unbind(-1)
+    stftm = real_part**2 + imag_part**2
+    mels = torch.matmul(mel_basis, stftm)
+    return torch.transpose(mels, 1, 2)
+
+
+def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
+    if increase_only and decrease_only:
+        raise ValueError("Both increase only and decrease only are set")
+    dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
+    if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
+        return wav
+    return wav * (10 ** (dBFS_change / 20))
+
+
+def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False):
+    # This torch version is designed to cope with a batch of same lengths wavs
+    if increase_only and decrease_only:
+        raise ValueError("Both increase only and decrease only are set")
+    dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1))
+    scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype)
+    if increase_only:
+        mask = (dBFS_change > 0).to(scales)
+    elif decrease_only:
+        mask = (dBFS_change < 0).to(scales)
+    else:
+        mask = torch.zeros_like(scales)
+    scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0)
+    return wavs * scales.unsqueeze(-1)
+
+
+def trim_long_silences(wav):
+    """
+    Ensures that segments without voice in the waveform remain no longer than a
+    threshold determined by the VAD parameters in params.py.
+
+    :param wav: the raw waveform as a numpy array of floats
+    :return: the same waveform with silences trimmed away (length <= original wav length)
+    """
+    # Compute the voice detection window size
+    samples_per_window = (vad_window_length * sampling_rate) // 1000
+
+    # Trim the end of the audio to have a multiple of the window size
+    wav = wav[:len(wav) - (len(wav) % samples_per_window)]
+
+    # Convert the float waveform to 16-bit mono PCM
+    pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
+
+    # Perform voice activation detection
+    voice_flags = []
+    vad = webrtcvad.Vad(mode=3)
+    for window_start in range(0, len(wav), samples_per_window):
+        window_end = window_start + samples_per_window
+        voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
+                                         sample_rate=sampling_rate))
+    voice_flags = np.array(voice_flags)
+
+    # Smooth the voice detection with a moving average
+    def moving_average(array, width):
+        array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
+        ret = np.cumsum(array_padded, dtype=float)
+        ret[width:] = ret[width:] - ret[:-width]
+        return ret[width - 1:] / width
+
+    audio_mask = moving_average(voice_flags, vad_moving_average_width)
+    audio_mask = np.round(audio_mask).astype(np.bool)
+
+    # Dilate the voiced regions
+    audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
+    audio_mask = np.repeat(audio_mask, samples_per_window)
+
+    return wav[audio_mask == True]
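A minimal sketch of chaining these helpers to produce encoder input; the audio path is a placeholder and the import path assumes the `speaker_encoder` module root is on `sys.path`:

```python
from encoder import audio

# Load, resample to the encoder's sampling rate, volume-normalize, and trim long silences
wav = audio.preprocess_wav('example.wav')  # 'example.wav' is a placeholder path

# Frame-level mel spectrogram fed to the GE2E speaker encoder
mel = audio.wav_to_mel_spectrogram(wav)
print(mel.shape)  # (n_frames, mel_n_channels), per the hyperparameters in params_data.py
```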
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/config.py
ADDED
@@ -0,0 +1,47 @@
+""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+librispeech_datasets = {
+    "train": {
+        "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
+        "other": ["LibriSpeech/train-other-500"]
+    },
+    "test": {
+        "clean": ["LibriSpeech/test-clean"],
+        "other": ["LibriSpeech/test-other"]
+    },
+    "dev": {
+        "clean": ["LibriSpeech/dev-clean"],
+        "other": ["LibriSpeech/dev-other"]
+    },
+}
+libritts_datasets = {
+    "train": {
+        "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
+        "other": ["LibriTTS/train-other-500"]
+    },
+    "test": {
+        "clean": ["LibriTTS/test-clean"],
+        "other": ["LibriTTS/test-other"]
+    },
+    "dev": {
+        "clean": ["LibriTTS/dev-clean"],
+        "other": ["LibriTTS/dev-other"]
+    },
+}
+voxceleb_datasets = {
+    "voxceleb1" : {
+        "train": ["VoxCeleb1/wav"],
+        "test": ["VoxCeleb1/test_wav"]
+    },
+    "voxceleb2" : {
+        "train": ["VoxCeleb2/dev/aac"],
+        "test": ["VoxCeleb2/test_wav"]
+    }
+}
+
+other_datasets = [
+    "LJSpeech-1.1",
+    "VCTK-Corpus/wav48",
+]
+
+anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/__init__.py
ADDED
@@ -0,0 +1,4 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .speaker_verification_dataset import SpeakerVerificationDataset
from .speaker_verification_dataset import SpeakerVerificationDataLoader
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py
ADDED
@@ -0,0 +1,39 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

import random

class RandomCycler:
    """
    Creates an internal copy of a sequence and allows access to its items in a constrained random
    order. For a source sequence of n items and one or several consecutive queries of a total
    of m items, the following guarantees hold (one implies the other):
        - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
        - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
    """

    def __init__(self, source):
        if len(source) == 0:
            raise Exception("Can't create RandomCycler from an empty collection")
        self.all_items = list(source)
        self.next_items = []

    def sample(self, count: int):
        shuffle = lambda l: random.sample(l, len(l))

        out = []
        while count > 0:
            if count >= len(self.all_items):
                out.extend(shuffle(list(self.all_items)))
                count -= len(self.all_items)
                continue
            n = min(count, len(self.next_items))
            out.extend(self.next_items[:n])
            count -= n
            self.next_items = self.next_items[n:]
            if len(self.next_items) == 0:
                self.next_items = shuffle(list(self.all_items))
        return out

    def __next__(self):
        return self.sample(1)[0]
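To make the docstring's guarantees concrete, here is a small usage sketch (assuming RandomCycler is imported from the module above; the exact counts depend on the RNG but stay within the stated bounds):

from collections import Counter

cycler = RandomCycler(["a", "b", "c"])   # n = 3 items
draws = cycler.sample(7)                 # m = 7 consecutive draws
print(Counter(draws))                    # each item appears 2 or 3 times (m // n .. (m - 1) // n + 1)
print(next(cycler))                      # single draw, same constraints apply across calls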
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker.py
ADDED
@@ -0,0 +1,42 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .random_cycler import RandomCycler
from .utterance import Utterance
from pathlib import Path

# Contains the set of utterances of a single speaker
class Speaker:
    def __init__(self, root: Path):
        self.root = root
        self.name = root.name
        self.utterances = None
        self.utterance_cycler = None

    def _load_utterances(self):
        with self.root.joinpath("_sources.txt").open("r") as sources_file:
            sources = [l.split(",") for l in sources_file]
        sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
        self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
        self.utterance_cycler = RandomCycler(self.utterances)

    def random_partial(self, count, n_frames):
        """
        Samples a batch of <count> unique partial utterances from the disk in a way that all
        utterances come up at least once every two cycles and in a random order every time.

        :param count: The number of partial utterances to sample from the set of utterances from
        that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
        the number of utterances available.
        :param n_frames: The number of frames in the partial utterance.
        :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
        frames are the frames of the partial utterances and range is the range of the partial
        utterance with regard to the complete utterance.
        """
        if self.utterances is None:
            self._load_utterances()

        utterances = self.utterance_cycler.sample(count)

        a = [(u,) + u.random_partial(n_frames) for u in utterances]

        return a
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py
ADDED
@@ -0,0 +1,14 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

import numpy as np
from typing import List
from .speaker import Speaker

class SpeakerBatch:
    def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
        self.speakers = speakers
        self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}

        # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
        # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
        self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py
ADDED
@@ -0,0 +1,58 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .random_cycler import RandomCycler
from .speaker_batch import SpeakerBatch
from .speaker import Speaker
from ..params_data import partials_n_frames
from torch.utils.data import Dataset, DataLoader
from pathlib import Path

# TODO: improve with a pool of speakers for data efficiency

class SpeakerVerificationDataset(Dataset):
    def __init__(self, datasets_root: Path):
        self.root = datasets_root
        speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
        if len(speaker_dirs) == 0:
            raise Exception("No speakers found. Make sure you are pointing to the directory "
                            "containing all preprocessed speaker directories.")
        self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
        self.speaker_cycler = RandomCycler(self.speakers)

    def __len__(self):
        return int(1e10)

    def __getitem__(self, index):
        return next(self.speaker_cycler)

    def get_logs(self):
        log_string = ""
        for log_fpath in self.root.glob("*.txt"):
            with log_fpath.open("r") as log_file:
                log_string += "".join(log_file.readlines())
        return log_string


class SpeakerVerificationDataLoader(DataLoader):
    def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
                 batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
                 worker_init_fn=None):
        self.utterances_per_speaker = utterances_per_speaker

        super().__init__(
            dataset=dataset,
            batch_size=speakers_per_batch,
            shuffle=False,
            sampler=sampler,
            batch_sampler=batch_sampler,
            num_workers=num_workers,
            collate_fn=self.collate,
            pin_memory=pin_memory,
            drop_last=False,
            timeout=timeout,
            worker_init_fn=worker_init_fn
        )

    def collate(self, speakers):
        return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
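A hypothetical way to wire these two classes together, assuming encoder_train_data/ is a placeholder for a directory produced by preprocess.py (one sub-directory of .npy mel frames plus a _sources.txt file per speaker):

from pathlib import Path

dataset = SpeakerVerificationDataset(Path("encoder_train_data"))
loader = SpeakerVerificationDataLoader(dataset,
                                       speakers_per_batch=64,
                                       utterances_per_speaker=10,
                                       num_workers=4)
batch = next(iter(loader))   # a SpeakerBatch assembled by collate()
print(batch.data.shape)      # (64 * 10, partials_n_frames, mel_n_channels) = (640, 160, 40)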
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/utterance.py
ADDED
@@ -0,0 +1,28 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

import numpy as np


class Utterance:
    def __init__(self, frames_fpath, wave_fpath):
        self.frames_fpath = frames_fpath
        self.wave_fpath = wave_fpath

    def get_frames(self):
        return np.load(self.frames_fpath)

    def random_partial(self, n_frames):
        """
        Crops the frames into a partial utterance of n_frames

        :param n_frames: The number of frames of the partial utterance
        :return: the partial utterance frames and a tuple indicating the start and end of the
        partial utterance in the complete utterance.
        """
        frames = self.get_frames()
        if frames.shape[0] == n_frames:
            start = 0
        else:
            start = np.random.randint(0, frames.shape[0] - n_frames)
        end = start + n_frames
        return frames[start:end], (start, end)
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/inference.py
ADDED
@@ -0,0 +1,211 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .params_data import *
from .model import SpeakerEncoder
from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram
from matplotlib import cm
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import torch

_model = None  # type: SpeakerEncoder
_device = None  # type: torch.device


def load_model(weights_fpath: Path, device="cpu"):
    """
    Loads the model in memory. If this function is not explicitly called, it will be run on the
    first call to embed_frames() with the default weights file.

    :param weights_fpath: the path to saved model weights.
    :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
    model will be loaded and will run on this device. Outputs will however always be on the cpu.
    If None, will default to your GPU if it's available, otherwise your CPU.
    """
    # TODO: I think the slow loading of the encoder might have something to do with the device it
    # was saved on. Worth investigating.
    global _model, _device
    if device is None:
        _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    elif isinstance(device, str):
        _device = torch.device(device)
    _model = SpeakerEncoder(_device, torch.device("cpu"))
    checkpoint = torch.load(weights_fpath, map_location="cpu")
    _model.load_state_dict(checkpoint["model_state"])
    _model.eval()
    _model = _model.to(device)
    print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))


def is_loaded():
    return _model is not None


@torch.no_grad()
def embed_frames_batch(frames, use_torch=False):
    if _model is None:
        raise Exception("Model was not loaded. Call load_model() before inference.")

    if not use_torch:
        frames = torch.from_numpy(frames)
    frames = frames.to(_device)

    embeds = _model.forward(frames)
    if not use_torch:
        embeds = embeds.detach().cpu().numpy()
    return embeds


def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
                           min_pad_coverage=0.75, overlap=0.5):
    """
    Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
    partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
    spectrogram slices are returned, so as to make each partial utterance waveform correspond to
    its spectrogram. This function assumes that the mel spectrogram parameters used are those
    defined in params_data.py.

    The returned ranges may be indexing further than the length of the waveform. It is
    recommended that you pad the waveform with zeros up to wave_slices[-1].stop.

    :param n_samples: the number of samples in the waveform
    :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
    utterance
    :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
    enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
    then the last partial utterance will be considered, as if we padded the audio. Otherwise,
    it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
    utterance, this parameter is ignored so that the function always returns at least 1 slice.
    :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
    utterances are entirely disjoint.
    :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
    respectively the waveform and the mel spectrogram with these slices to obtain the partial
    utterances.
    """
    assert 0 <= overlap < 1
    assert 0 < min_pad_coverage <= 1

    samples_per_frame = int((sampling_rate * mel_window_step / 1000))
    n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
    frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)

    # Compute the slices
    wav_slices, mel_slices = [], []
    steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
    for i in range(0, steps, frame_step):
        mel_range = np.array([i, i + partial_utterance_n_frames])
        wav_range = mel_range * samples_per_frame
        mel_slices.append(slice(*mel_range))
        wav_slices.append(slice(*wav_range))

    # Evaluate whether extra padding is warranted or not
    last_wav_range = wav_slices[-1]
    coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
    if coverage < min_pad_coverage and len(mel_slices) > 1:
        mel_slices = mel_slices[:-1]
        wav_slices = wav_slices[:-1]

    return wav_slices, mel_slices


@torch.no_grad()
def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
    """
    Computes an embedding for a single utterance.

    # TODO: handle multiple wavs to benefit from batching on GPU
    :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
    :param using_partials: if True, then the utterance is split in partial utterances of
    <partial_utterance_n_frames> frames and the utterance embedding is computed from their
    normalized average. If False, the utterance is instead computed from feeding the entire
    spectrogram to the network.
    :param return_partials: if True, the partial embeddings will also be returned along with the
    wav slices that correspond to the partial embeddings.
    :param kwargs: additional arguments to compute_partial_slices()
    :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
    <return_partials> is True, the partial utterances as a numpy array of float32 of shape
    (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
    returned. If <using_partials> is simultaneously set to False, both these values will be None
    instead.
    """
    # Process the entire utterance if not using partials
    if not using_partials:
        frames = wav_to_mel_spectrogram(wav)
        embed = embed_frames_batch(frames[None, ...])[0]
        if return_partials:
            return embed, None, None
        return embed

    # Compute where to split the utterance into partials and pad if necessary
    wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= len(wav):
        wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")

    # Split the utterance into partials
    frames = wav_to_mel_spectrogram(wav)
    frames_batch = np.array([frames[s] for s in mel_slices])
    partial_embeds = embed_frames_batch(frames_batch)

    # Compute the utterance embedding from the partial embeddings
    raw_embed = np.mean(partial_embeds, axis=0)
    embed = raw_embed / np.linalg.norm(raw_embed, 2)

    if return_partials:
        return embed, partial_embeds, wave_slices
    return embed


@torch.no_grad()
def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs):
    # This torch version is designed to cope with a batch of wavs of the same length
    if not using_partials:
        frames = wav_to_mel_spectrogram_batch(wavs)
        embeds = embed_frames_batch(frames)
        if return_partials:
            return embeds, None, None
        return embeds

    wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs)
    max_wave_length = wave_slices[-1].stop
    if max_wave_length >= wavs.shape[-1]:
        wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]),
                                           dtype=wavs.dtype, device=wavs.device)], 1)

    frames = wav_to_mel_spectrogram_batch(wavs)
    frames_batch = []
    for i in range(len(frames)):
        frames_batch += [frames[i][s] for s in mel_slices]
    frames_batch = torch.stack(frames_batch, 0)
    partial_embeds = embed_frames_batch(frames_batch, use_torch=True)
    partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1)

    raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False)
    embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True)

    if return_partials:
        return embeds, partial_embeds, wave_slices
    return embeds


def embed_speaker(wavs, **kwargs):
    raise NotImplementedError()


def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
    if ax is None:
        ax = plt.gca()

    if shape is None:
        height = int(np.sqrt(len(embed)))
        shape = (height, -1)
    embed = embed.reshape(shape)

    cmap = cm.get_cmap()
    mappable = ax.imshow(embed, cmap=cmap)
    cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
    cbar.set_clim(*color_range)

    ax.set_xticks([]), ax.set_yticks([])
    ax.set_title(title)
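A minimal inference sketch, assuming a GE2E checkpoint saved with the {"model_state": ..., "step": ...} layout that load_model() expects; both file names are placeholders:

from pathlib import Path
import numpy as np

load_model(Path("encoder.pt"), device="cpu")          # placeholder weights file

wav = preprocess_wav(Path("speaker_reference.wav"))   # placeholder reference recording
embed = embed_utterance(wav)
print(embed.shape, np.linalg.norm(embed))             # (256,) and ~1.0 (unit-norm embedding)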
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/model.py
ADDED
@@ -0,0 +1,137 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .params_model import *
from .params_data import *
from scipy.interpolate import interp1d
from sklearn.metrics import roc_curve
from torch.nn.utils import clip_grad_norm_
from scipy.optimize import brentq
from torch import nn
import numpy as np
import torch


class SpeakerEncoder(nn.Module):
    def __init__(self, device, loss_device):
        super().__init__()
        self.loss_device = loss_device

        # Network definition
        self.lstm = nn.LSTM(input_size=mel_n_channels,
                            hidden_size=model_hidden_size,
                            num_layers=model_num_layers,
                            batch_first=True).to(device)
        self.linear = nn.Linear(in_features=model_hidden_size,
                                out_features=model_embedding_size).to(device)
        self.relu = torch.nn.ReLU().to(device)

        # Cosine similarity scaling (with fixed initial parameter values)
        self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
        self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)

        # Loss
        self.loss_fn = nn.CrossEntropyLoss().to(loss_device)

    def do_gradient_ops(self):
        # Gradient scale
        self.similarity_weight.grad *= 0.01
        self.similarity_bias.grad *= 0.01

        # Gradient clipping
        clip_grad_norm_(self.parameters(), 3, norm_type=2)

    def forward(self, utterances, hidden_init=None):
        """
        Computes the embeddings of a batch of utterance spectrograms.

        :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
        (batch_size, n_frames, n_channels)
        :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
        batch_size, hidden_size). Will default to a tensor of zeros if None.
        :return: the embeddings as a tensor of shape (batch_size, embedding_size)
        """
        # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
        # and the final cell state.
        out, (hidden, cell) = self.lstm(utterances, hidden_init)

        # We take only the hidden state of the last layer
        embeds_raw = self.relu(self.linear(hidden[-1]))

        # L2-normalize it
        embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)

        return embeds

    def similarity_matrix(self, embeds):
        """
        Computes the similarity matrix according to section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the similarity matrix as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, speakers_per_batch)
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
        centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
        centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)

        # Exclusive centroids (1 per utterance)
        centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
        centroids_excl /= (utterances_per_speaker - 1)
        centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)

        # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
        # product of these vectors (which is just an element-wise multiplication reduced by a sum).
        # We vectorize the computation for efficiency.
        sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
                                 speakers_per_batch).to(self.loss_device)
        mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)
        for j in range(speakers_per_batch):
            mask = np.where(mask_matrix[j])[0]
            sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
            sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)

        ## Even more vectorized version (slower maybe because of transpose)
        # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
        #                           ).to(self.loss_device)
        # eye = np.eye(speakers_per_batch, dtype=int)
        # mask = np.where(1 - eye)
        # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
        # mask = np.where(eye)
        # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
        # sim_matrix2 = sim_matrix2.transpose(1, 2)

        sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
        return sim_matrix

    def loss(self, embeds):
        """
        Computes the softmax loss according to section 2.1 of GE2E.

        :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
        utterances_per_speaker, embedding_size)
        :return: the loss and the EER for this batch of embeddings.
        """
        speakers_per_batch, utterances_per_speaker = embeds.shape[:2]

        # Loss
        sim_matrix = self.similarity_matrix(embeds)
        sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
                                         speakers_per_batch))
        ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
        target = torch.from_numpy(ground_truth).long().to(self.loss_device)
        loss = self.loss_fn(sim_matrix, target)

        # EER (not backpropagated)
        with torch.no_grad():
            inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
            labels = np.array([inv_argmax(i) for i in ground_truth])
            preds = sim_matrix.detach().cpu().numpy()

            # Snippet from https://yangcha.github.io/EER-ROC/
            fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
            eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)

        return loss, eer
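A rough shape check of the GE2E similarity matrix and loss with random unit-norm embeddings (a sketch only; real embeddings come from SpeakerEncoder.forward, and 256 stands in for model_embedding_size):

import torch

device = torch.device("cpu")
model = SpeakerEncoder(device, loss_device=device)

embeds = torch.randn(4, 5, 256)                            # (speakers, utterances, embedding size)
embeds = embeds / torch.norm(embeds, dim=2, keepdim=True)  # L2-normalize like forward() does

sim = model.similarity_matrix(embeds)
print(sim.shape)                                           # torch.Size([4, 5, 4])
loss, eer = model.loss(embeds)
print(float(loss), eer)                                    # scalar softmax loss, EER in [0, 1]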
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_data.py
ADDED
@@ -0,0 +1,30 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

## Mel-filterbank
mel_window_length = 25  # In milliseconds
mel_window_step = 10    # In milliseconds
mel_n_channels = 40


## Audio
sampling_rate = 16000
# Number of spectrogram frames in a partial utterance
partials_n_frames = 160     # 1600 ms
# Number of spectrogram frames at inference
inference_n_frames = 80     # 800 ms


## Voice Activation Detection
# Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
# This sets the granularity of the VAD. Should not need to be changed.
vad_window_length = 30  # In milliseconds
# Number of frames to average together when performing the moving average smoothing.
# The larger this value, the larger the VAD variations must be to not get smoothed out.
vad_moving_average_width = 8
# Maximum number of consecutive silent frames a segment can have.
vad_max_silence_length = 6


## Audio volume normalization
audio_norm_target_dBFS = -30
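Back-of-the-envelope arithmetic implied by these values (illustration only, not part of the file):

samples_per_vad_window = vad_window_length * sampling_rate // 1000   # 30 ms * 16 kHz = 480 samples
mel_frame_hop = sampling_rate * mel_window_step // 1000              # 160 samples between mel frames
partial_utterance_ms = partials_n_frames * mel_window_step           # 160 frames * 10 ms = 1600 ms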
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_model.py
ADDED
@@ -0,0 +1,12 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

## Model parameters
model_hidden_size = 256
model_embedding_size = 256
model_num_layers = 3


## Training parameters
learning_rate_init = 1e-4
speakers_per_batch = 64
utterances_per_speaker = 10
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/preprocess.py
ADDED
@@ -0,0 +1,177 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from multiprocess.pool import ThreadPool
from .params_data import *
from .config import librispeech_datasets, anglophone_nationalites
from datetime import datetime
from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch
from pathlib import Path
from tqdm import tqdm
import numpy as np


class DatasetLog:
    """
    Registers metadata about the dataset in a text file.
    """
    def __init__(self, root, name):
        self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
        self.sample_data = dict()

        start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("Creating dataset %s on %s" % (name, start_time))
        self.write_line("-----")
        self._log_params()

    def _log_params(self):
        from . import params_data
        self.write_line("Parameter values:")
        for param_name in (p for p in dir(params_data) if not p.startswith("__")):
            value = getattr(params_data, param_name)
            self.write_line("\t%s: %s" % (param_name, value))
        self.write_line("-----")

    def write_line(self, line):
        self.text_file.write("%s\n" % line)

    def add_sample(self, **kwargs):
        for param_name, value in kwargs.items():
            if not param_name in self.sample_data:
                self.sample_data[param_name] = []
            self.sample_data[param_name].append(value)

    def finalize(self):
        self.write_line("Statistics:")
        for param_name, values in self.sample_data.items():
            self.write_line("\t%s:" % param_name)
            self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
            self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
        self.write_line("-----")
        end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
        self.write_line("Finished on %s" % end_time)
        self.text_file.close()


def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
    dataset_root = datasets_root.joinpath(dataset_name)
    if not dataset_root.exists():
        print("Couldn't find %s, skipping this dataset." % dataset_root)
        return None, None
    return dataset_root, DatasetLog(out_dir, dataset_name)


def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
                             skip_existing, logger):
    print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))

    # Function to preprocess utterances for one speaker
    def preprocess_speaker(speaker_dir: Path):
        # Give a name to the speaker that includes its dataset
        speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)

        # Create an output directory with that name, as well as a txt file containing a
        # reference to each source file.
        speaker_out_dir = out_dir.joinpath(speaker_name)
        speaker_out_dir.mkdir(exist_ok=True)
        sources_fpath = speaker_out_dir.joinpath("_sources.txt")

        # There's a possibility that the preprocessing was interrupted earlier, check if
        # there already is a sources file.
        if sources_fpath.exists():
            try:
                with sources_fpath.open("r") as sources_file:
                    existing_fnames = {line.split(",")[0] for line in sources_file}
            except Exception:
                existing_fnames = {}
        else:
            existing_fnames = {}

        # Gather all audio files for that speaker recursively
        sources_file = sources_fpath.open("a" if skip_existing else "w")
        for in_fpath in speaker_dir.glob("**/*.%s" % extension):
            # Check if the target output file already exists
            out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
            out_fname = out_fname.replace(".%s" % extension, ".npy")
            if skip_existing and out_fname in existing_fnames:
                continue

            # Load and preprocess the waveform
            wav = preprocess_wav(in_fpath)
            if len(wav) == 0:
                continue

            # Create the mel spectrogram, discard those that are too short
            frames = wav_to_mel_spectrogram(wav)
            if len(frames) < partials_n_frames:
                continue

            out_fpath = speaker_out_dir.joinpath(out_fname)
            np.save(out_fpath, frames)
            logger.add_sample(duration=len(wav) / sampling_rate)
            sources_file.write("%s,%s\n" % (out_fname, in_fpath))

        sources_file.close()

    # Process the utterances for each speaker
    with ThreadPool(8) as pool:
        list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
                  unit="speakers"))
    logger.finalize()
    print("Done preprocessing %s.\n" % dataset_name)


def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
    for dataset_name in librispeech_datasets["train"]["other"]:
        # Initialize the preprocessing
        dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
        if not dataset_root:
            return

        # Preprocess all speakers
        speaker_dirs = list(dataset_root.glob("*"))
        _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
                                 skip_existing, logger)


def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
    # Initialize the preprocessing
    dataset_name = "VoxCeleb1"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Get the contents of the meta file
    with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
        metadata = [line.split("\t") for line in metafile][1:]

    # Select the ID and the nationality, filter out non-anglophone speakers
    nationalities = {line[0]: line[3] for line in metadata}
    keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
                        nationality.lower() in anglophone_nationalites]
    print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
          (len(keep_speaker_ids), len(nationalities)))

    # Get the speaker directories for anglophone speakers only
    speaker_dirs = dataset_root.joinpath("wav").glob("*")
    speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
                    speaker_dir.name in keep_speaker_ids]
    print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
          (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))

    # Preprocess all speakers
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
                             skip_existing, logger)


def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
    # Initialize the preprocessing
    dataset_name = "VoxCeleb2"
    dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
    if not dataset_root:
        return

    # Get the speaker directories
    # Preprocess all speakers
    speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
    _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
                             skip_existing, logger)
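A hypothetical invocation, assuming <datasets_root> contains the directory layouts listed in config.py (e.g. LibriSpeech/train-other-500, VoxCeleb1/wav); both paths below are placeholders:

from pathlib import Path

datasets_root = Path("/data/voice_datasets")
out_dir = Path("/data/encoder_train_data")
out_dir.mkdir(parents=True, exist_ok=True)

preprocess_librispeech(datasets_root, out_dir, skip_existing=True)
preprocess_voxceleb1(datasets_root, out_dir, skip_existing=True)
preprocess_voxceleb2(datasets_root, out_dir, skip_existing=True)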
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/train.py
ADDED
@@ -0,0 +1,127 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from .visualizations import Visualizations
from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
from .params_model import *
from .model import SpeakerEncoder
from .utils.profiler import Profiler
from pathlib import Path
import torch

def sync(device: torch.device):
    # FIXME
    return
    # For correct profiling (cuda operations are async)
    if device.type == "cuda":
        torch.cuda.synchronize(device)

def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
          backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
          no_visdom: bool):
    # Create a dataset and a dataloader
    dataset = SpeakerVerificationDataset(clean_data_root)
    loader = SpeakerVerificationDataLoader(
        dataset,
        speakers_per_batch,
        utterances_per_speaker,
        num_workers=8,
    )

    # Setup the device on which to run the forward pass and the loss. These can be different,
    # because the forward pass is faster on the GPU whereas the loss is often (depending on your
    # hyperparameters) faster on the CPU.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # FIXME: currently, the gradient is None if loss_device is cuda
    loss_device = torch.device("cpu")

    # Create the model and the optimizer
    model = SpeakerEncoder(device, loss_device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
    init_step = 1

    # Configure file path for the model
    state_fpath = models_dir.joinpath(run_id + ".pt")
    backup_dir = models_dir.joinpath(run_id + "_backups")

    # Load any existing model
    if not force_restart:
        if state_fpath.exists():
            print("Found existing model \"%s\", loading it and resuming training." % run_id)
            checkpoint = torch.load(state_fpath)
            init_step = checkpoint["step"]
            model.load_state_dict(checkpoint["model_state"])
            optimizer.load_state_dict(checkpoint["optimizer_state"])
            optimizer.param_groups[0]["lr"] = learning_rate_init
        else:
            print("No model \"%s\" found, starting training from scratch." % run_id)
    else:
        print("Starting the training from scratch.")
    model.train()

    # Initialize the visualization environment
    vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
    vis.log_dataset(dataset)
    vis.log_params()
    device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
    vis.log_implementation({"Device": device_name})

    # Training loop
    profiler = Profiler(summarize_every=10, disabled=False)
    for step, speaker_batch in enumerate(loader, init_step):
        profiler.tick("Blocking, waiting for batch (threaded)")

        # Forward pass
        inputs = torch.from_numpy(speaker_batch.data).to(device)
        sync(device)
        profiler.tick("Data to %s" % device)
        embeds = model(inputs)
        sync(device)
        profiler.tick("Forward pass")
        embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
        loss, eer = model.loss(embeds_loss)
        sync(loss_device)
        profiler.tick("Loss")

        # Backward pass
        model.zero_grad()
        loss.backward()
        profiler.tick("Backward pass")
        model.do_gradient_ops()
        optimizer.step()
        profiler.tick("Parameter update")

        # Update visualizations
        # learning_rate = optimizer.param_groups[0]["lr"]
        vis.update(loss.item(), eer, step)

        # Draw projections and save them to the backup folder
        if umap_every != 0 and step % umap_every == 0:
            print("Drawing and saving projections (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
            embeds = embeds.detach().cpu().numpy()
            vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
            vis.save()

        # Overwrite the latest version of the model
        if save_every != 0 and step % save_every == 0:
            print("Saving the model (step %d)" % step)
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, state_fpath)

        # Make a backup
        if backup_every != 0 and step % backup_every == 0:
            print("Making a backup (step %d)" % step)
            backup_dir.mkdir(exist_ok=True)
            backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
            torch.save({
                "step": step + 1,
                "model_state": model.state_dict(),
                "optimizer_state": optimizer.state_dict(),
            }, backup_fpath)

        profiler.tick("Extras (visualizations, saving)")
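A hypothetical call matching the signature above; the directories are placeholders, and no_visdom=True keeps the run headless (how fully visdom is bypassed depends on the Visualizations class):

from pathlib import Path

train(run_id="speaker_encoder_run1",
      clean_data_root=Path("/data/encoder_train_data"),   # output of preprocess.py
      models_dir=Path("/data/encoder_models"),
      umap_every=500, save_every=1000, backup_every=5000, vis_every=100,
      force_restart=False, visdom_server="http://localhost", no_visdom=True)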
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/__init__.py
ADDED
@@ -0,0 +1 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/argutils.py
ADDED
@@ -0,0 +1,42 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from pathlib import Path
import numpy as np
import argparse

_type_priorities = [    # In decreasing order
    Path,
    str,
    int,
    float,
    bool,
]

def _priority(o):
    p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None)
    if p is not None:
        return p
    p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None)
    if p is not None:
        return p
    return len(_type_priorities)

def print_args(args: argparse.Namespace, parser=None):
    args = vars(args)
    if parser is None:
        priorities = list(map(_priority, args.values()))
    else:
        all_params = [a.dest for g in parser._action_groups for a in g._group_actions]
        priority = lambda p: all_params.index(p) if p in all_params else len(all_params)
        priorities = list(map(priority, args.keys()))

    pad = max(map(len, args.keys())) + 3
    indices = np.lexsort((list(args.keys()), priorities))
    items = list(args.items())

    print("Arguments:")
    for i in indices:
        param, value = items[i]
        print("    {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value))
    print("")
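Typical use of print_args right after parse_args(); the parser below is a made-up example, not one from this repository:

import argparse
from pathlib import Path

parser = argparse.ArgumentParser()
parser.add_argument("datasets_root", type=Path)
parser.add_argument("-o", "--out_dir", type=Path, default="encoder_out")
parser.add_argument("-s", "--skip_existing", action="store_true")
args = parser.parse_args(["my_datasets", "-s"])

print_args(args, parser)   # prints the arguments padded and ordered by parser definition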
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/logmmse.py
ADDED
@@ -0,0 +1,222 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
|
2 |
+
|
3 |
+
import numpy as np
|
4 |
+
import math
|
5 |
+
from scipy.special import expn
|
6 |
+
from collections import namedtuple
|
7 |
+
|
8 |
+
NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2")
|
9 |
+
|
10 |
+
|
11 |
+
def profile_noise(noise, sampling_rate, window_size=0):
|
12 |
+
"""
|
13 |
+
Creates a profile of the noise in a given waveform.
|
14 |
+
|
15 |
+
:param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
16 |
+
:param sampling_rate: the sampling rate of the audio
|
17 |
+
:param window_size: the size of the window the logmmse algorithm operates on. A default value
|
18 |
+
will be picked if left as 0.
|
19 |
+
:return: a NoiseProfile object
|
20 |
+
"""
|
21 |
+
noise, dtype = to_float(noise)
|
22 |
+
noise += np.finfo(np.float64).eps
|
23 |
+
|
24 |
+
if window_size == 0:
|
25 |
+
window_size = int(math.floor(0.02 * sampling_rate))
|
26 |
+
|
27 |
+
if window_size % 2 == 1:
|
28 |
+
window_size = window_size + 1
|
29 |
+
|
30 |
+
perc = 50
|
31 |
+
len1 = int(math.floor(window_size * perc / 100))
|
32 |
+
len2 = int(window_size - len1)
|
33 |
+
|
34 |
+
win = np.hanning(window_size)
|
35 |
+
win = win * len2 / np.sum(win)
|
36 |
+
n_fft = 2 * window_size
|
37 |
+
|
38 |
+
noise_mean = np.zeros(n_fft)
|
39 |
+
n_frames = len(noise) // window_size
|
40 |
+
for j in range(0, window_size * n_frames, window_size):
|
41 |
+
noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
|
42 |
+
noise_mu2 = (noise_mean / n_frames) ** 2
|
43 |
+
|
44 |
+
return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
|
45 |
+
|
46 |
+
|
47 |
+
def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
|
48 |
+
"""
|
49 |
+
Cleans the noise from a speech waveform given a noise profile. The waveform must have the
|
50 |
+
same sampling rate as the one used to create the noise profile.
|
51 |
+
|
52 |
+
:param wav: a speech waveform as a numpy array of floats or ints.
|
53 |
+
:param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
|
54 |
+
the same) waveform.
|
55 |
+
:param eta: voice threshold for noise update. While the voice activation detection value is
|
56 |
+
below this threshold, the noise profile will be continuously updated throughout the audio.
|
57 |
+
Set to 0 to disable updating the noise profile.
|
58 |
+
:return: the clean wav as a numpy array of floats or ints of the same length.
|
59 |
+
"""
|
60 |
+
wav, dtype = to_float(wav)
|
61 |
+
wav += np.finfo(np.float64).eps
|
62 |
+
p = noise_profile
|
63 |
+
|
64 |
+
nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
|
65 |
+
x_final = np.zeros(nframes * p.len2)
|
66 |
+
|
67 |
+
aa = 0.98
|
68 |
+
mu = 0.98
|
69 |
+
ksi_min = 10 ** (-25 / 10)
|
70 |
+
|
71 |
+
x_old = np.zeros(p.len1)
|
72 |
+
xk_prev = np.zeros(p.len1)
|
73 |
+
noise_mu2 = p.noise_mu2
|
74 |
+
for k in range(0, nframes * p.len2, p.len2):
|
75 |
+
insign = p.win * wav[k:k + p.window_size]
|
76 |
+
|
77 |
+
spec = np.fft.fft(insign, p.n_fft, axis=0)
|
78 |
+
sig = np.absolute(spec)
|
79 |
+
sig2 = sig ** 2
|
80 |
+
|
81 |
+
gammak = np.minimum(sig2 / noise_mu2, 40)
|
82 |
+
|
83 |
+
if xk_prev.all() == 0:
|
84 |
+
ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
|
85 |
+
else:
|
86 |
+
ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
|
87 |
+
ksi = np.maximum(ksi_min, ksi)
|
88 |
+
|
89 |
+
log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
|
90 |
+
vad_decision = np.sum(log_sigma_k) / p.window_size
|
91 |
+
if vad_decision < eta:
|
92 |
+
noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
|
93 |
+
|
94 |
+
a = ksi / (1 + ksi)
|
95 |
+
vk = a * gammak
|
96 |
+
ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
|
97 |
+
hw = a * np.exp(ei_vk)
|
98 |
+
sig = sig * hw
|
99 |
+
xk_prev = sig ** 2
|
100 |
+
xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
|
101 |
+
xi_w = np.real(xi_w)
|
102 |
+
|
103 |
+
x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
|
104 |
+
x_old = xi_w[p.len1:p.window_size]
|
105 |
+
|
106 |
+
output = from_float(x_final, dtype)
|
107 |
+
output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
|
108 |
+
return output
|
109 |
+
|
110 |
+
|
111 |
+
## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that
|
112 |
+
## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of
|
113 |
+
## webrctvad
|
114 |
+
# def vad(wav, sampling_rate, eta=0.15, window_size=0):
|
115 |
+
# """
|
116 |
+
# TODO: fix doc
|
117 |
+
# Creates a profile of the noise in a given waveform.
|
118 |
+
#
|
119 |
+
# :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints.
|
120 |
+
# :param sampling_rate: the sampling rate of the audio
# :param window_size: the size of the window the logmmse algorithm operates on. A default value
# will be picked if left as 0.
# :param eta: voice threshold for noise update. While the voice activation detection value is
# below this threshold, the noise profile will be continuously updated throughout the audio.
# Set to 0 to disable updating the noise profile.
# """
# wav, dtype = to_float(wav)
# wav += np.finfo(np.float64).eps
#
# if window_size == 0:
# window_size = int(math.floor(0.02 * sampling_rate))
#
# if window_size % 2 == 1:
# window_size = window_size + 1
#
# perc = 50
# len1 = int(math.floor(window_size * perc / 100))
# len2 = int(window_size - len1)
#
# win = np.hanning(window_size)
# win = win * len2 / np.sum(win)
# n_fft = 2 * window_size
#
# wav_mean = np.zeros(n_fft)
# n_frames = len(wav) // window_size
# for j in range(0, window_size * n_frames, window_size):
# wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0))
# noise_mu2 = (wav_mean / n_frames) ** 2
#
# wav, dtype = to_float(wav)
# wav += np.finfo(np.float64).eps
#
# nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2))
# vad = np.zeros(nframes * len2, dtype=np.bool)
#
# aa = 0.98
# mu = 0.98
# ksi_min = 10 ** (-25 / 10)
#
# xk_prev = np.zeros(len1)
# noise_mu2 = noise_mu2
# for k in range(0, nframes * len2, len2):
# insign = win * wav[k:k + window_size]
#
# spec = np.fft.fft(insign, n_fft, axis=0)
# sig = np.absolute(spec)
# sig2 = sig ** 2
#
# gammak = np.minimum(sig2 / noise_mu2, 40)
#
# if xk_prev.all() == 0:
# ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
# else:
# ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
# ksi = np.maximum(ksi_min, ksi)
#
# log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
# vad_decision = np.sum(log_sigma_k) / window_size
# if vad_decision < eta:
# noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
# print(vad_decision)
#
# a = ksi / (1 + ksi)
# vk = a * gammak
# ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
# hw = a * np.exp(ei_vk)
# sig = sig * hw
# xk_prev = sig ** 2
#
# vad[k:k + len2] = vad_decision >= eta
#
# vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant")
# return vad


def to_float(_input):
    # Convert a waveform of any supported dtype to float64 and return it
    # together with the original dtype (which from_float needs to restore it).
    if _input.dtype == np.float64:
        return _input, _input.dtype
    elif _input.dtype == np.float32:
        return _input.astype(np.float64), _input.dtype
    elif _input.dtype == np.uint8:
        return (_input - 128) / 128., _input.dtype
    elif _input.dtype == np.int16:
        return _input / 32768., _input.dtype
    elif _input.dtype == np.int32:
        return _input / 2147483648., _input.dtype
    raise ValueError('Unsupported wave file format')


def from_float(_input, dtype):
    # Convert a float64 waveform back to the dtype recorded by to_float.
    # (The float64 branch now returns the array itself, consistent with the
    # other branches, and the stray debug print has been removed.)
    if dtype == np.float64:
        return _input
    elif dtype == np.float32:
        return _input.astype(np.float32)
    elif dtype == np.uint8:
        return ((_input * 128) + 128).astype(np.uint8)
    elif dtype == np.int16:
        return (_input * 32768).astype(np.int16)
    elif dtype == np.int32:
        return (_input * 2147483648).astype(np.int32)
    raise ValueError('Unsupported wave file format')
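A minimal usage sketch (illustrative, not part of the diff): how the two dtype helpers above are meant to be used together. The import path and the int16 sample values are assumptions for the example.

```python
import numpy as np
from encoder.utils.logmmse import to_float, from_float   # assumed import path for the file above

wav_int16 = np.array([0, 16384, -16384, 32767], dtype=np.int16)  # hypothetical input samples

# to_float scales the samples to float64 and remembers the source dtype.
wav, dtype = to_float(wav_int16)

# ... denoising would operate on the float64 signal here ...

# from_float converts the processed signal back to the original format.
wav_out = from_float(wav, dtype)
assert wav_out.dtype == np.int16
```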
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/profiler.py
ADDED
@@ -0,0 +1,47 @@
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """

from time import perf_counter as timer
from collections import OrderedDict
import numpy as np


class Profiler:
    def __init__(self, summarize_every=5, disabled=False):
        self.last_tick = timer()
        self.logs = OrderedDict()
        self.summarize_every = summarize_every
        self.disabled = disabled

    def tick(self, name):
        if self.disabled:
            return

        # Log the time needed to execute that function
        if name not in self.logs:
            self.logs[name] = []
        if len(self.logs[name]) >= self.summarize_every:
            self.summarize()
            self.purge_logs()
        self.logs[name].append(timer() - self.last_tick)

        self.reset_timer()

    def purge_logs(self):
        for name in self.logs:
            self.logs[name].clear()

    def reset_timer(self):
        self.last_tick = timer()

    def summarize(self):
        n = max(map(len, self.logs.values()))
        assert n == self.summarize_every
        print("\nAverage execution time over %d steps:" % n)

        name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()]
        pad = max(map(len, name_msgs))
        for name_msg, deltas in zip(name_msgs, self.logs.values()):
            print("  %s  mean: %4.0fms  std: %4.0fms" %
                  (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000))
        print("", flush=True)
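For context, a short sketch (not part of the diff) of how the profiler above is typically driven from a loop; the sleeps merely stand in for real work, and the import path is an assumption.

```python
from time import sleep
from encoder.utils.profiler import Profiler   # assumed import path for the file above

profiler = Profiler(summarize_every=5)
for step in range(10):
    sleep(0.01)                     # stand-in for data loading
    profiler.tick("Data load")
    sleep(0.02)                     # stand-in for a training step
    profiler.tick("Training step")
# After every 5 recorded ticks per name, mean/std execution times are printed
# and the per-name logs are reset.
```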
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/visualizations.py
ADDED
@@ -0,0 +1,180 @@
1 |
+
""" from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
|
2 |
+
|
3 |
+
from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset
|
4 |
+
from datetime import datetime
|
5 |
+
from time import perf_counter as timer
|
6 |
+
import matplotlib.pyplot as plt
|
7 |
+
import numpy as np
|
8 |
+
# import webbrowser
|
9 |
+
import visdom
|
10 |
+
import umap
|
11 |
+
|
12 |
+
colormap = np.array([
|
13 |
+
[76, 255, 0],
|
14 |
+
[0, 127, 70],
|
15 |
+
[255, 0, 0],
|
16 |
+
[255, 217, 38],
|
17 |
+
[0, 135, 255],
|
18 |
+
[165, 0, 165],
|
19 |
+
[255, 167, 255],
|
20 |
+
[0, 255, 255],
|
21 |
+
[255, 96, 38],
|
22 |
+
[142, 76, 0],
|
23 |
+
[33, 0, 127],
|
24 |
+
[0, 0, 0],
|
25 |
+
[183, 183, 183],
|
26 |
+
], dtype=np.float64) / 255  # np.float was removed in NumPy 1.24; float64 keeps the old behaviour
|
27 |
+
|
28 |
+
|
29 |
+
class Visualizations:
|
30 |
+
def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
|
31 |
+
# Tracking data
|
32 |
+
self.last_update_timestamp = timer()
|
33 |
+
self.update_every = update_every
|
34 |
+
self.step_times = []
|
35 |
+
self.losses = []
|
36 |
+
self.eers = []
|
37 |
+
print("Updating the visualizations every %d steps." % update_every)
|
38 |
+
|
39 |
+
# If visdom is disabled TODO: use a better paradigm for that
|
40 |
+
self.disabled = disabled
|
41 |
+
if self.disabled:
|
42 |
+
return
|
43 |
+
|
44 |
+
# Set the environment name
|
45 |
+
now = str(datetime.now().strftime("%d-%m %Hh%M"))
|
46 |
+
if env_name is None:
|
47 |
+
self.env_name = now
|
48 |
+
else:
|
49 |
+
self.env_name = "%s (%s)" % (env_name, now)
|
50 |
+
|
51 |
+
# Connect to visdom and open the corresponding window in the browser
|
52 |
+
try:
|
53 |
+
self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
|
54 |
+
except ConnectionError:
|
55 |
+
raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
|
56 |
+
"start it.")
|
57 |
+
# webbrowser.open("http://localhost:8097/env/" + self.env_name)
|
58 |
+
|
59 |
+
# Create the windows
|
60 |
+
self.loss_win = None
|
61 |
+
self.eer_win = None
|
62 |
+
# self.lr_win = None
|
63 |
+
self.implementation_win = None
|
64 |
+
self.projection_win = None
|
65 |
+
self.implementation_string = ""
|
66 |
+
|
67 |
+
def log_params(self):
|
68 |
+
if self.disabled:
|
69 |
+
return
|
70 |
+
from encoder import params_data
|
71 |
+
from encoder import params_model
|
72 |
+
param_string = "<b>Model parameters</b>:<br>"
|
73 |
+
for param_name in (p for p in dir(params_model) if not p.startswith("__")):
|
74 |
+
value = getattr(params_model, param_name)
|
75 |
+
param_string += "\t%s: %s<br>" % (param_name, value)
|
76 |
+
param_string += "<b>Data parameters</b>:<br>"
|
77 |
+
for param_name in (p for p in dir(params_data) if not p.startswith("__")):
|
78 |
+
value = getattr(params_data, param_name)
|
79 |
+
param_string += "\t%s: %s<br>" % (param_name, value)
|
80 |
+
self.vis.text(param_string, opts={"title": "Parameters"})
|
81 |
+
|
82 |
+
def log_dataset(self, dataset: SpeakerVerificationDataset):
|
83 |
+
if self.disabled:
|
84 |
+
return
|
85 |
+
dataset_string = ""
|
86 |
+
dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
|
87 |
+
dataset_string += "\n" + dataset.get_logs()
|
88 |
+
dataset_string = dataset_string.replace("\n", "<br>")
|
89 |
+
self.vis.text(dataset_string, opts={"title": "Dataset"})
|
90 |
+
|
91 |
+
def log_implementation(self, params):
|
92 |
+
if self.disabled:
|
93 |
+
return
|
94 |
+
implementation_string = ""
|
95 |
+
for param, value in params.items():
|
96 |
+
implementation_string += "<b>%s</b>: %s\n" % (param, value)
|
97 |
+
implementation_string = implementation_string.replace("\n", "<br>")
|
98 |
+
self.implementation_string = implementation_string
|
99 |
+
self.implementation_win = self.vis.text(
|
100 |
+
implementation_string,
|
101 |
+
opts={"title": "Training implementation"}
|
102 |
+
)
|
103 |
+
|
104 |
+
def update(self, loss, eer, step):
|
105 |
+
# Update the tracking data
|
106 |
+
now = timer()
|
107 |
+
self.step_times.append(1000 * (now - self.last_update_timestamp))
|
108 |
+
self.last_update_timestamp = now
|
109 |
+
self.losses.append(loss)
|
110 |
+
self.eers.append(eer)
|
111 |
+
print(".", end="")
|
112 |
+
|
113 |
+
# Update the plots every <update_every> steps
|
114 |
+
if step % self.update_every != 0:
|
115 |
+
return
|
116 |
+
time_string = "Step time: mean: %5dms std: %5dms" % \
|
117 |
+
(int(np.mean(self.step_times)), int(np.std(self.step_times)))
|
118 |
+
print("\nStep %6d Loss: %.4f EER: %.4f %s" %
|
119 |
+
(step, np.mean(self.losses), np.mean(self.eers), time_string))
|
120 |
+
if not self.disabled:
|
121 |
+
self.loss_win = self.vis.line(
|
122 |
+
[np.mean(self.losses)],
|
123 |
+
[step],
|
124 |
+
win=self.loss_win,
|
125 |
+
update="append" if self.loss_win else None,
|
126 |
+
opts=dict(
|
127 |
+
legend=["Avg. loss"],
|
128 |
+
xlabel="Step",
|
129 |
+
ylabel="Loss",
|
130 |
+
title="Loss",
|
131 |
+
)
|
132 |
+
)
|
133 |
+
self.eer_win = self.vis.line(
|
134 |
+
[np.mean(self.eers)],
|
135 |
+
[step],
|
136 |
+
win=self.eer_win,
|
137 |
+
update="append" if self.eer_win else None,
|
138 |
+
opts=dict(
|
139 |
+
legend=["Avg. EER"],
|
140 |
+
xlabel="Step",
|
141 |
+
ylabel="EER",
|
142 |
+
title="Equal error rate"
|
143 |
+
)
|
144 |
+
)
|
145 |
+
if self.implementation_win is not None:
|
146 |
+
self.vis.text(
|
147 |
+
self.implementation_string + ("<b>%s</b>" % time_string),
|
148 |
+
win=self.implementation_win,
|
149 |
+
opts={"title": "Training implementation"},
|
150 |
+
)
|
151 |
+
|
152 |
+
# Reset the tracking
|
153 |
+
self.losses.clear()
|
154 |
+
self.eers.clear()
|
155 |
+
self.step_times.clear()
|
156 |
+
|
157 |
+
def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
|
158 |
+
max_speakers=10):
|
159 |
+
max_speakers = min(max_speakers, len(colormap))
|
160 |
+
embeds = embeds[:max_speakers * utterances_per_speaker]
|
161 |
+
|
162 |
+
n_speakers = len(embeds) // utterances_per_speaker
|
163 |
+
ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
|
164 |
+
colors = [colormap[i] for i in ground_truth]
|
165 |
+
|
166 |
+
reducer = umap.UMAP()
|
167 |
+
projected = reducer.fit_transform(embeds)
|
168 |
+
plt.scatter(projected[:, 0], projected[:, 1], c=colors)
|
169 |
+
plt.gca().set_aspect("equal", "datalim")
|
170 |
+
plt.title("UMAP projection (step %d)" % step)
|
171 |
+
if not self.disabled:
|
172 |
+
self.projection_win = self.vis.matplot(plt, win=self.projection_win)
|
173 |
+
if out_fpath is not None:
|
174 |
+
plt.savefig(out_fpath)
|
175 |
+
plt.clf()
|
176 |
+
|
177 |
+
def save(self):
|
178 |
+
if not self.disabled:
|
179 |
+
self.vis.save([self.env_name])
|
180 |
+
|
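A hedged sketch (not part of the diff) of how this class is exercised during speaker-encoder training. Importing the module requires visdom and umap to be installed; the loss/EER values below are synthetic, and `disabled=True` skips plotting so no visdom server is needed.

```python
import numpy as np
from encoder.visualizations import Visualizations   # assumed import path for the file above

vis = Visualizations(env_name="encoder_run", update_every=10, disabled=True)
for step in range(1, 101):
    loss = float(np.exp(-step / 50.0))        # synthetic values for illustration
    eer = float(0.5 * np.exp(-step / 80.0))
    vis.update(loss, eer, step)               # prints a summary every 10 steps
vis.save()                                    # no-op while disabled
```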
dreamvoice/train_utils/src/openvoice/__init__.py
ADDED
File without changes
|
dreamvoice/train_utils/src/openvoice/api.py
ADDED
@@ -0,0 +1,202 @@
1 |
+
import torch
|
2 |
+
import numpy as np
|
3 |
+
import re
|
4 |
+
import soundfile
|
5 |
+
from openvoice import utils
|
6 |
+
from openvoice import commons
|
7 |
+
import os
|
8 |
+
import librosa
|
9 |
+
from openvoice.text import text_to_sequence
|
10 |
+
from openvoice.mel_processing import spectrogram_torch
|
11 |
+
from openvoice.models import SynthesizerTrn
|
12 |
+
|
13 |
+
|
14 |
+
class OpenVoiceBaseClass(object):
|
15 |
+
def __init__(self,
|
16 |
+
config_path,
|
17 |
+
device='cuda:0'):
|
18 |
+
if 'cuda' in device:
|
19 |
+
assert torch.cuda.is_available()
|
20 |
+
|
21 |
+
hps = utils.get_hparams_from_file(config_path)
|
22 |
+
|
23 |
+
model = SynthesizerTrn(
|
24 |
+
len(getattr(hps, 'symbols', [])),
|
25 |
+
hps.data.filter_length // 2 + 1,
|
26 |
+
n_speakers=hps.data.n_speakers,
|
27 |
+
**hps.model,
|
28 |
+
).to(device)
|
29 |
+
|
30 |
+
model.eval()
|
31 |
+
self.model = model
|
32 |
+
self.hps = hps
|
33 |
+
self.device = device
|
34 |
+
|
35 |
+
def load_ckpt(self, ckpt_path):
|
36 |
+
checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
|
37 |
+
a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
|
38 |
+
print("Loaded checkpoint '{}'".format(ckpt_path))
|
39 |
+
print('missing/unexpected keys:', a, b)
|
40 |
+
|
41 |
+
|
42 |
+
class BaseSpeakerTTS(OpenVoiceBaseClass):
|
43 |
+
language_marks = {
|
44 |
+
"english": "EN",
|
45 |
+
"chinese": "ZH",
|
46 |
+
}
|
47 |
+
|
48 |
+
@staticmethod
|
49 |
+
def get_text(text, hps, is_symbol):
|
50 |
+
text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
|
51 |
+
if hps.data.add_blank:
|
52 |
+
text_norm = commons.intersperse(text_norm, 0)
|
53 |
+
text_norm = torch.LongTensor(text_norm)
|
54 |
+
return text_norm
|
55 |
+
|
56 |
+
@staticmethod
|
57 |
+
def audio_numpy_concat(segment_data_list, sr, speed=1.):
|
58 |
+
audio_segments = []
|
59 |
+
for segment_data in segment_data_list:
|
60 |
+
audio_segments += segment_data.reshape(-1).tolist()
|
61 |
+
audio_segments += [0] * int((sr * 0.05)/speed)
|
62 |
+
audio_segments = np.array(audio_segments).astype(np.float32)
|
63 |
+
return audio_segments
|
64 |
+
|
65 |
+
@staticmethod
|
66 |
+
def split_sentences_into_pieces(text, language_str):
|
67 |
+
texts = utils.split_sentence(text, language_str=language_str)
|
68 |
+
print(" > Text splitted to sentences.")
|
69 |
+
print('\n'.join(texts))
|
70 |
+
print(" > ===========================")
|
71 |
+
return texts
|
72 |
+
|
73 |
+
def tts(self, text, output_path, speaker, language='English', speed=1.0):
|
74 |
+
mark = self.language_marks.get(language.lower(), None)
|
75 |
+
assert mark is not None, f"language {language} is not supported"
|
76 |
+
|
77 |
+
texts = self.split_sentences_into_pieces(text, mark)
|
78 |
+
|
79 |
+
audio_list = []
|
80 |
+
for t in texts:
|
81 |
+
t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
|
82 |
+
t = f'[{mark}]{t}[{mark}]'
|
83 |
+
stn_tst = self.get_text(t, self.hps, False)
|
84 |
+
device = self.device
|
85 |
+
speaker_id = self.hps.speakers[speaker]
|
86 |
+
with torch.no_grad():
|
87 |
+
x_tst = stn_tst.unsqueeze(0).to(device)
|
88 |
+
x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
|
89 |
+
sid = torch.LongTensor([speaker_id]).to(device)
|
90 |
+
audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
|
91 |
+
length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
|
92 |
+
audio_list.append(audio)
|
93 |
+
audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
|
94 |
+
|
95 |
+
if output_path is None:
|
96 |
+
return audio
|
97 |
+
else:
|
98 |
+
soundfile.write(output_path, audio, self.hps.data.sampling_rate)
|
99 |
+
|
100 |
+
|
101 |
+
class ToneColorConverter(OpenVoiceBaseClass):
|
102 |
+
    def __init__(self, *args, **kwargs):
        # 'enable_watermark' is consumed here; OpenVoiceBaseClass.__init__ does not
        # accept it, so pop it before forwarding the remaining arguments.
        enable_watermark = kwargs.pop('enable_watermark', True)
        super().__init__(*args, **kwargs)

        if enable_watermark:
            import wavmark
            self.watermark_model = wavmark.load_model().to(self.device)
        else:
            self.watermark_model = None
        self.version = getattr(self.hps, '_version_', "v1")
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
def extract_se(self, ref_wav_list, se_save_path=None):
|
115 |
+
if isinstance(ref_wav_list, str):
|
116 |
+
ref_wav_list = [ref_wav_list]
|
117 |
+
|
118 |
+
device = self.device
|
119 |
+
hps = self.hps
|
120 |
+
gs = []
|
121 |
+
|
122 |
+
for fname in ref_wav_list:
|
123 |
+
audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
|
124 |
+
y = torch.FloatTensor(audio_ref)
|
125 |
+
y = y.to(device)
|
126 |
+
y = y.unsqueeze(0)
|
127 |
+
y = spectrogram_torch(y, hps.data.filter_length,
|
128 |
+
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
129 |
+
center=False).to(device)
|
130 |
+
with torch.no_grad():
|
131 |
+
g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
|
132 |
+
gs.append(g.detach())
|
133 |
+
gs = torch.stack(gs).mean(0)
|
134 |
+
|
135 |
+
if se_save_path is not None:
|
136 |
+
os.makedirs(os.path.dirname(se_save_path), exist_ok=True)
|
137 |
+
torch.save(gs.cpu(), se_save_path)
|
138 |
+
|
139 |
+
return gs
|
140 |
+
|
141 |
+
def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
|
142 |
+
hps = self.hps
|
143 |
+
# load audio
|
144 |
+
audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
|
145 |
+
audio = torch.tensor(audio).float()
|
146 |
+
|
147 |
+
with torch.no_grad():
|
148 |
+
y = torch.FloatTensor(audio).to(self.device)
|
149 |
+
y = y.unsqueeze(0)
|
150 |
+
spec = spectrogram_torch(y, hps.data.filter_length,
|
151 |
+
hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
|
152 |
+
center=False).to(self.device)
|
153 |
+
spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
|
154 |
+
audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
|
155 |
+
0, 0].data.cpu().float().numpy()
|
156 |
+
audio = self.add_watermark(audio, message)
|
157 |
+
if output_path is None:
|
158 |
+
return audio
|
159 |
+
else:
|
160 |
+
soundfile.write(output_path, audio, hps.data.sampling_rate)
|
161 |
+
|
162 |
+
def add_watermark(self, audio, message):
|
163 |
+
if self.watermark_model is None:
|
164 |
+
return audio
|
165 |
+
device = self.device
|
166 |
+
bits = utils.string_to_bits(message).reshape(-1)
|
167 |
+
n_repeat = len(bits) // 32
|
168 |
+
|
169 |
+
K = 16000
|
170 |
+
coeff = 2
|
171 |
+
for n in range(n_repeat):
|
172 |
+
trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
|
173 |
+
if len(trunck) != K:
|
174 |
+
print('Audio too short, fail to add watermark')
|
175 |
+
break
|
176 |
+
message_npy = bits[n * 32: (n + 1) * 32]
|
177 |
+
|
178 |
+
with torch.no_grad():
|
179 |
+
signal = torch.FloatTensor(trunck).to(device)[None]
|
180 |
+
message_tensor = torch.FloatTensor(message_npy).to(device)[None]
|
181 |
+
signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
|
182 |
+
signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
|
183 |
+
audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
|
184 |
+
return audio
|
185 |
+
|
186 |
+
def detect_watermark(self, audio, n_repeat):
|
187 |
+
bits = []
|
188 |
+
K = 16000
|
189 |
+
coeff = 2
|
190 |
+
for n in range(n_repeat):
|
191 |
+
trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
|
192 |
+
if len(trunck) != K:
|
193 |
+
print('Audio too short, fail to detect watermark')
|
194 |
+
return 'Fail'
|
195 |
+
with torch.no_grad():
|
196 |
+
signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
|
197 |
+
message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
|
198 |
+
bits.append(message_decoded_npy)
|
199 |
+
bits = np.stack(bits).reshape(-1, 8)
|
200 |
+
message = utils.bits_to_string(bits)
|
201 |
+
return message
|
202 |
+
|
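To make the API above concrete, a hedged usage sketch for the tone-color conversion path. The config/checkpoint paths and wav filenames are placeholders (an OpenVoice-style config.json/checkpoint.pth pair is assumed), and watermarking requires the optional wavmark package unless it is disabled.

```python
from openvoice.api import ToneColorConverter

converter = ToneColorConverter('checkpoints/converter/config.json', device='cpu')
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

# Speaker (tone-color) embeddings; a list of reference wavs is averaged.
src_se = converter.extract_se('source_speaker.wav')
tgt_se = converter.extract_se(['ref_1.wav', 'ref_2.wav'])

converter.convert('source_speaker.wav', src_se, tgt_se,
                  output_path='converted.wav', tau=0.3)
```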
dreamvoice/train_utils/src/openvoice/attentions.py
ADDED
@@ -0,0 +1,465 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
from openvoice import commons
|
7 |
+
import logging
|
8 |
+
|
9 |
+
logger = logging.getLogger(__name__)
|
10 |
+
|
11 |
+
|
12 |
+
class LayerNorm(nn.Module):
|
13 |
+
def __init__(self, channels, eps=1e-5):
|
14 |
+
super().__init__()
|
15 |
+
self.channels = channels
|
16 |
+
self.eps = eps
|
17 |
+
|
18 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
19 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
20 |
+
|
21 |
+
def forward(self, x):
|
22 |
+
x = x.transpose(1, -1)
|
23 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
24 |
+
return x.transpose(1, -1)
|
25 |
+
|
26 |
+
|
27 |
+
@torch.jit.script
|
28 |
+
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
29 |
+
n_channels_int = n_channels[0]
|
30 |
+
in_act = input_a + input_b
|
31 |
+
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
32 |
+
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
33 |
+
acts = t_act * s_act
|
34 |
+
return acts
|
35 |
+
|
36 |
+
|
37 |
+
class Encoder(nn.Module):
|
38 |
+
def __init__(
|
39 |
+
self,
|
40 |
+
hidden_channels,
|
41 |
+
filter_channels,
|
42 |
+
n_heads,
|
43 |
+
n_layers,
|
44 |
+
kernel_size=1,
|
45 |
+
p_dropout=0.0,
|
46 |
+
window_size=4,
|
47 |
+
isflow=True,
|
48 |
+
**kwargs
|
49 |
+
):
|
50 |
+
super().__init__()
|
51 |
+
self.hidden_channels = hidden_channels
|
52 |
+
self.filter_channels = filter_channels
|
53 |
+
self.n_heads = n_heads
|
54 |
+
self.n_layers = n_layers
|
55 |
+
self.kernel_size = kernel_size
|
56 |
+
self.p_dropout = p_dropout
|
57 |
+
self.window_size = window_size
|
58 |
+
# if isflow:
|
59 |
+
# cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
|
60 |
+
# self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
|
61 |
+
# self.cond_layer = weight_norm(cond_layer, name='weight')
|
62 |
+
# self.gin_channels = 256
|
63 |
+
self.cond_layer_idx = self.n_layers
|
64 |
+
if "gin_channels" in kwargs:
|
65 |
+
self.gin_channels = kwargs["gin_channels"]
|
66 |
+
if self.gin_channels != 0:
|
67 |
+
self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
|
68 |
+
# vits2 says 3rd block, so idx is 2 by default
|
69 |
+
self.cond_layer_idx = (
|
70 |
+
kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
|
71 |
+
)
|
72 |
+
# logging.debug(self.gin_channels, self.cond_layer_idx)
|
73 |
+
assert (
|
74 |
+
self.cond_layer_idx < self.n_layers
|
75 |
+
), "cond_layer_idx should be less than n_layers"
|
76 |
+
self.drop = nn.Dropout(p_dropout)
|
77 |
+
self.attn_layers = nn.ModuleList()
|
78 |
+
self.norm_layers_1 = nn.ModuleList()
|
79 |
+
self.ffn_layers = nn.ModuleList()
|
80 |
+
self.norm_layers_2 = nn.ModuleList()
|
81 |
+
|
82 |
+
for i in range(self.n_layers):
|
83 |
+
self.attn_layers.append(
|
84 |
+
MultiHeadAttention(
|
85 |
+
hidden_channels,
|
86 |
+
hidden_channels,
|
87 |
+
n_heads,
|
88 |
+
p_dropout=p_dropout,
|
89 |
+
window_size=window_size,
|
90 |
+
)
|
91 |
+
)
|
92 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
93 |
+
self.ffn_layers.append(
|
94 |
+
FFN(
|
95 |
+
hidden_channels,
|
96 |
+
hidden_channels,
|
97 |
+
filter_channels,
|
98 |
+
kernel_size,
|
99 |
+
p_dropout=p_dropout,
|
100 |
+
)
|
101 |
+
)
|
102 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
103 |
+
|
104 |
+
def forward(self, x, x_mask, g=None):
|
105 |
+
attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
106 |
+
x = x * x_mask
|
107 |
+
for i in range(self.n_layers):
|
108 |
+
if i == self.cond_layer_idx and g is not None:
|
109 |
+
g = self.spk_emb_linear(g.transpose(1, 2))
|
110 |
+
g = g.transpose(1, 2)
|
111 |
+
x = x + g
|
112 |
+
x = x * x_mask
|
113 |
+
y = self.attn_layers[i](x, x, attn_mask)
|
114 |
+
y = self.drop(y)
|
115 |
+
x = self.norm_layers_1[i](x + y)
|
116 |
+
|
117 |
+
y = self.ffn_layers[i](x, x_mask)
|
118 |
+
y = self.drop(y)
|
119 |
+
x = self.norm_layers_2[i](x + y)
|
120 |
+
x = x * x_mask
|
121 |
+
return x
|
122 |
+
|
123 |
+
|
124 |
+
class Decoder(nn.Module):
|
125 |
+
def __init__(
|
126 |
+
self,
|
127 |
+
hidden_channels,
|
128 |
+
filter_channels,
|
129 |
+
n_heads,
|
130 |
+
n_layers,
|
131 |
+
kernel_size=1,
|
132 |
+
p_dropout=0.0,
|
133 |
+
proximal_bias=False,
|
134 |
+
proximal_init=True,
|
135 |
+
**kwargs
|
136 |
+
):
|
137 |
+
super().__init__()
|
138 |
+
self.hidden_channels = hidden_channels
|
139 |
+
self.filter_channels = filter_channels
|
140 |
+
self.n_heads = n_heads
|
141 |
+
self.n_layers = n_layers
|
142 |
+
self.kernel_size = kernel_size
|
143 |
+
self.p_dropout = p_dropout
|
144 |
+
self.proximal_bias = proximal_bias
|
145 |
+
self.proximal_init = proximal_init
|
146 |
+
|
147 |
+
self.drop = nn.Dropout(p_dropout)
|
148 |
+
self.self_attn_layers = nn.ModuleList()
|
149 |
+
self.norm_layers_0 = nn.ModuleList()
|
150 |
+
self.encdec_attn_layers = nn.ModuleList()
|
151 |
+
self.norm_layers_1 = nn.ModuleList()
|
152 |
+
self.ffn_layers = nn.ModuleList()
|
153 |
+
self.norm_layers_2 = nn.ModuleList()
|
154 |
+
for i in range(self.n_layers):
|
155 |
+
self.self_attn_layers.append(
|
156 |
+
MultiHeadAttention(
|
157 |
+
hidden_channels,
|
158 |
+
hidden_channels,
|
159 |
+
n_heads,
|
160 |
+
p_dropout=p_dropout,
|
161 |
+
proximal_bias=proximal_bias,
|
162 |
+
proximal_init=proximal_init,
|
163 |
+
)
|
164 |
+
)
|
165 |
+
self.norm_layers_0.append(LayerNorm(hidden_channels))
|
166 |
+
self.encdec_attn_layers.append(
|
167 |
+
MultiHeadAttention(
|
168 |
+
hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
|
169 |
+
)
|
170 |
+
)
|
171 |
+
self.norm_layers_1.append(LayerNorm(hidden_channels))
|
172 |
+
self.ffn_layers.append(
|
173 |
+
FFN(
|
174 |
+
hidden_channels,
|
175 |
+
hidden_channels,
|
176 |
+
filter_channels,
|
177 |
+
kernel_size,
|
178 |
+
p_dropout=p_dropout,
|
179 |
+
causal=True,
|
180 |
+
)
|
181 |
+
)
|
182 |
+
self.norm_layers_2.append(LayerNorm(hidden_channels))
|
183 |
+
|
184 |
+
def forward(self, x, x_mask, h, h_mask):
|
185 |
+
"""
|
186 |
+
x: decoder input
|
187 |
+
h: encoder output
|
188 |
+
"""
|
189 |
+
self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
|
190 |
+
device=x.device, dtype=x.dtype
|
191 |
+
)
|
192 |
+
encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
|
193 |
+
x = x * x_mask
|
194 |
+
for i in range(self.n_layers):
|
195 |
+
y = self.self_attn_layers[i](x, x, self_attn_mask)
|
196 |
+
y = self.drop(y)
|
197 |
+
x = self.norm_layers_0[i](x + y)
|
198 |
+
|
199 |
+
y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
|
200 |
+
y = self.drop(y)
|
201 |
+
x = self.norm_layers_1[i](x + y)
|
202 |
+
|
203 |
+
y = self.ffn_layers[i](x, x_mask)
|
204 |
+
y = self.drop(y)
|
205 |
+
x = self.norm_layers_2[i](x + y)
|
206 |
+
x = x * x_mask
|
207 |
+
return x
|
208 |
+
|
209 |
+
|
210 |
+
class MultiHeadAttention(nn.Module):
|
211 |
+
def __init__(
|
212 |
+
self,
|
213 |
+
channels,
|
214 |
+
out_channels,
|
215 |
+
n_heads,
|
216 |
+
p_dropout=0.0,
|
217 |
+
window_size=None,
|
218 |
+
heads_share=True,
|
219 |
+
block_length=None,
|
220 |
+
proximal_bias=False,
|
221 |
+
proximal_init=False,
|
222 |
+
):
|
223 |
+
super().__init__()
|
224 |
+
assert channels % n_heads == 0
|
225 |
+
|
226 |
+
self.channels = channels
|
227 |
+
self.out_channels = out_channels
|
228 |
+
self.n_heads = n_heads
|
229 |
+
self.p_dropout = p_dropout
|
230 |
+
self.window_size = window_size
|
231 |
+
self.heads_share = heads_share
|
232 |
+
self.block_length = block_length
|
233 |
+
self.proximal_bias = proximal_bias
|
234 |
+
self.proximal_init = proximal_init
|
235 |
+
self.attn = None
|
236 |
+
|
237 |
+
self.k_channels = channels // n_heads
|
238 |
+
self.conv_q = nn.Conv1d(channels, channels, 1)
|
239 |
+
self.conv_k = nn.Conv1d(channels, channels, 1)
|
240 |
+
self.conv_v = nn.Conv1d(channels, channels, 1)
|
241 |
+
self.conv_o = nn.Conv1d(channels, out_channels, 1)
|
242 |
+
self.drop = nn.Dropout(p_dropout)
|
243 |
+
|
244 |
+
if window_size is not None:
|
245 |
+
n_heads_rel = 1 if heads_share else n_heads
|
246 |
+
rel_stddev = self.k_channels**-0.5
|
247 |
+
self.emb_rel_k = nn.Parameter(
|
248 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
249 |
+
* rel_stddev
|
250 |
+
)
|
251 |
+
self.emb_rel_v = nn.Parameter(
|
252 |
+
torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
|
253 |
+
* rel_stddev
|
254 |
+
)
|
255 |
+
|
256 |
+
nn.init.xavier_uniform_(self.conv_q.weight)
|
257 |
+
nn.init.xavier_uniform_(self.conv_k.weight)
|
258 |
+
nn.init.xavier_uniform_(self.conv_v.weight)
|
259 |
+
if proximal_init:
|
260 |
+
with torch.no_grad():
|
261 |
+
self.conv_k.weight.copy_(self.conv_q.weight)
|
262 |
+
self.conv_k.bias.copy_(self.conv_q.bias)
|
263 |
+
|
264 |
+
def forward(self, x, c, attn_mask=None):
|
265 |
+
q = self.conv_q(x)
|
266 |
+
k = self.conv_k(c)
|
267 |
+
v = self.conv_v(c)
|
268 |
+
|
269 |
+
x, self.attn = self.attention(q, k, v, mask=attn_mask)
|
270 |
+
|
271 |
+
x = self.conv_o(x)
|
272 |
+
return x
|
273 |
+
|
274 |
+
def attention(self, query, key, value, mask=None):
|
275 |
+
# reshape [b, d, t] -> [b, n_h, t, d_k]
|
276 |
+
b, d, t_s, t_t = (*key.size(), query.size(2))
|
277 |
+
query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
|
278 |
+
key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
279 |
+
value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
|
280 |
+
|
281 |
+
scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
|
282 |
+
if self.window_size is not None:
|
283 |
+
assert (
|
284 |
+
t_s == t_t
|
285 |
+
), "Relative attention is only available for self-attention."
|
286 |
+
key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
|
287 |
+
rel_logits = self._matmul_with_relative_keys(
|
288 |
+
query / math.sqrt(self.k_channels), key_relative_embeddings
|
289 |
+
)
|
290 |
+
scores_local = self._relative_position_to_absolute_position(rel_logits)
|
291 |
+
scores = scores + scores_local
|
292 |
+
if self.proximal_bias:
|
293 |
+
assert t_s == t_t, "Proximal bias is only available for self-attention."
|
294 |
+
scores = scores + self._attention_bias_proximal(t_s).to(
|
295 |
+
device=scores.device, dtype=scores.dtype
|
296 |
+
)
|
297 |
+
if mask is not None:
|
298 |
+
scores = scores.masked_fill(mask == 0, -1e4)
|
299 |
+
if self.block_length is not None:
|
300 |
+
assert (
|
301 |
+
t_s == t_t
|
302 |
+
), "Local attention is only available for self-attention."
|
303 |
+
block_mask = (
|
304 |
+
torch.ones_like(scores)
|
305 |
+
.triu(-self.block_length)
|
306 |
+
.tril(self.block_length)
|
307 |
+
)
|
308 |
+
scores = scores.masked_fill(block_mask == 0, -1e4)
|
309 |
+
p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
|
310 |
+
p_attn = self.drop(p_attn)
|
311 |
+
output = torch.matmul(p_attn, value)
|
312 |
+
if self.window_size is not None:
|
313 |
+
relative_weights = self._absolute_position_to_relative_position(p_attn)
|
314 |
+
value_relative_embeddings = self._get_relative_embeddings(
|
315 |
+
self.emb_rel_v, t_s
|
316 |
+
)
|
317 |
+
output = output + self._matmul_with_relative_values(
|
318 |
+
relative_weights, value_relative_embeddings
|
319 |
+
)
|
320 |
+
output = (
|
321 |
+
output.transpose(2, 3).contiguous().view(b, d, t_t)
|
322 |
+
) # [b, n_h, t_t, d_k] -> [b, d, t_t]
|
323 |
+
return output, p_attn
|
324 |
+
|
325 |
+
def _matmul_with_relative_values(self, x, y):
|
326 |
+
"""
|
327 |
+
x: [b, h, l, m]
|
328 |
+
y: [h or 1, m, d]
|
329 |
+
ret: [b, h, l, d]
|
330 |
+
"""
|
331 |
+
ret = torch.matmul(x, y.unsqueeze(0))
|
332 |
+
return ret
|
333 |
+
|
334 |
+
def _matmul_with_relative_keys(self, x, y):
|
335 |
+
"""
|
336 |
+
x: [b, h, l, d]
|
337 |
+
y: [h or 1, m, d]
|
338 |
+
ret: [b, h, l, m]
|
339 |
+
"""
|
340 |
+
ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
|
341 |
+
return ret
|
342 |
+
|
343 |
+
def _get_relative_embeddings(self, relative_embeddings, length):
|
344 |
+
2 * self.window_size + 1
|
345 |
+
# Pad first before slice to avoid using cond ops.
|
346 |
+
pad_length = max(length - (self.window_size + 1), 0)
|
347 |
+
slice_start_position = max((self.window_size + 1) - length, 0)
|
348 |
+
slice_end_position = slice_start_position + 2 * length - 1
|
349 |
+
if pad_length > 0:
|
350 |
+
padded_relative_embeddings = F.pad(
|
351 |
+
relative_embeddings,
|
352 |
+
commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
|
353 |
+
)
|
354 |
+
else:
|
355 |
+
padded_relative_embeddings = relative_embeddings
|
356 |
+
used_relative_embeddings = padded_relative_embeddings[
|
357 |
+
:, slice_start_position:slice_end_position
|
358 |
+
]
|
359 |
+
return used_relative_embeddings
|
360 |
+
|
361 |
+
def _relative_position_to_absolute_position(self, x):
|
362 |
+
"""
|
363 |
+
x: [b, h, l, 2*l-1]
|
364 |
+
ret: [b, h, l, l]
|
365 |
+
"""
|
366 |
+
batch, heads, length, _ = x.size()
|
367 |
+
# Concat columns of pad to shift from relative to absolute indexing.
|
368 |
+
x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
|
369 |
+
|
370 |
+
# Concat extra elements so to add up to shape (len+1, 2*len-1).
|
371 |
+
x_flat = x.view([batch, heads, length * 2 * length])
|
372 |
+
x_flat = F.pad(
|
373 |
+
x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
|
374 |
+
)
|
375 |
+
|
376 |
+
# Reshape and slice out the padded elements.
|
377 |
+
x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
|
378 |
+
:, :, :length, length - 1 :
|
379 |
+
]
|
380 |
+
return x_final
|
381 |
+
|
382 |
+
def _absolute_position_to_relative_position(self, x):
|
383 |
+
"""
|
384 |
+
x: [b, h, l, l]
|
385 |
+
ret: [b, h, l, 2*l-1]
|
386 |
+
"""
|
387 |
+
batch, heads, length, _ = x.size()
|
388 |
+
# pad along column
|
389 |
+
x = F.pad(
|
390 |
+
x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
|
391 |
+
)
|
392 |
+
x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
|
393 |
+
# add 0's in the beginning that will skew the elements after reshape
|
394 |
+
x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
|
395 |
+
x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
|
396 |
+
return x_final
|
397 |
+
|
398 |
+
def _attention_bias_proximal(self, length):
|
399 |
+
"""Bias for self-attention to encourage attention to close positions.
|
400 |
+
Args:
|
401 |
+
length: an integer scalar.
|
402 |
+
Returns:
|
403 |
+
a Tensor with shape [1, 1, length, length]
|
404 |
+
"""
|
405 |
+
r = torch.arange(length, dtype=torch.float32)
|
406 |
+
diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
|
407 |
+
return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
|
408 |
+
|
409 |
+
|
410 |
+
class FFN(nn.Module):
|
411 |
+
def __init__(
|
412 |
+
self,
|
413 |
+
in_channels,
|
414 |
+
out_channels,
|
415 |
+
filter_channels,
|
416 |
+
kernel_size,
|
417 |
+
p_dropout=0.0,
|
418 |
+
activation=None,
|
419 |
+
causal=False,
|
420 |
+
):
|
421 |
+
super().__init__()
|
422 |
+
self.in_channels = in_channels
|
423 |
+
self.out_channels = out_channels
|
424 |
+
self.filter_channels = filter_channels
|
425 |
+
self.kernel_size = kernel_size
|
426 |
+
self.p_dropout = p_dropout
|
427 |
+
self.activation = activation
|
428 |
+
self.causal = causal
|
429 |
+
|
430 |
+
if causal:
|
431 |
+
self.padding = self._causal_padding
|
432 |
+
else:
|
433 |
+
self.padding = self._same_padding
|
434 |
+
|
435 |
+
self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
|
436 |
+
self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
|
437 |
+
self.drop = nn.Dropout(p_dropout)
|
438 |
+
|
439 |
+
def forward(self, x, x_mask):
|
440 |
+
x = self.conv_1(self.padding(x * x_mask))
|
441 |
+
if self.activation == "gelu":
|
442 |
+
x = x * torch.sigmoid(1.702 * x)
|
443 |
+
else:
|
444 |
+
x = torch.relu(x)
|
445 |
+
x = self.drop(x)
|
446 |
+
x = self.conv_2(self.padding(x * x_mask))
|
447 |
+
return x * x_mask
|
448 |
+
|
449 |
+
def _causal_padding(self, x):
|
450 |
+
if self.kernel_size == 1:
|
451 |
+
return x
|
452 |
+
pad_l = self.kernel_size - 1
|
453 |
+
pad_r = 0
|
454 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
455 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
456 |
+
return x
|
457 |
+
|
458 |
+
def _same_padding(self, x):
|
459 |
+
if self.kernel_size == 1:
|
460 |
+
return x
|
461 |
+
pad_l = (self.kernel_size - 1) // 2
|
462 |
+
pad_r = self.kernel_size // 2
|
463 |
+
padding = [[0, 0], [0, 0], [pad_l, pad_r]]
|
464 |
+
x = F.pad(x, commons.convert_pad_shape(padding))
|
465 |
+
return x
|
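A small shape-check sketch (not part of the diff) for the relative-attention Encoder defined above; the hyperparameter values simply mirror common VITS-style settings and are not prescribed by this file.

```python
import torch
from openvoice import attentions, commons

enc = attentions.Encoder(hidden_channels=192, filter_channels=768,
                         n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1)

x = torch.randn(2, 192, 50)                                    # [batch, channels, time]
x_lengths = torch.LongTensor([50, 37])
x_mask = commons.sequence_mask(x_lengths, x.size(2)).unsqueeze(1).to(x.dtype)

with torch.no_grad():
    y = enc(x * x_mask, x_mask)                                # -> [2, 192, 50]
```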
dreamvoice/train_utils/src/openvoice/commons.py
ADDED
@@ -0,0 +1,160 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch.nn import functional as F
|
4 |
+
|
5 |
+
|
6 |
+
def init_weights(m, mean=0.0, std=0.01):
|
7 |
+
classname = m.__class__.__name__
|
8 |
+
if classname.find("Conv") != -1:
|
9 |
+
m.weight.data.normal_(mean, std)
|
10 |
+
|
11 |
+
|
12 |
+
def get_padding(kernel_size, dilation=1):
|
13 |
+
return int((kernel_size * dilation - dilation) / 2)
|
14 |
+
|
15 |
+
|
16 |
+
def convert_pad_shape(pad_shape):
|
17 |
+
layer = pad_shape[::-1]
|
18 |
+
pad_shape = [item for sublist in layer for item in sublist]
|
19 |
+
return pad_shape
|
20 |
+
|
21 |
+
|
22 |
+
def intersperse(lst, item):
|
23 |
+
result = [item] * (len(lst) * 2 + 1)
|
24 |
+
result[1::2] = lst
|
25 |
+
return result
|
26 |
+
|
27 |
+
|
28 |
+
def kl_divergence(m_p, logs_p, m_q, logs_q):
|
29 |
+
"""KL(P||Q)"""
|
30 |
+
kl = (logs_q - logs_p) - 0.5
|
31 |
+
kl += (
|
32 |
+
0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
|
33 |
+
)
|
34 |
+
return kl
|
35 |
+
|
36 |
+
|
37 |
+
def rand_gumbel(shape):
|
38 |
+
"""Sample from the Gumbel distribution, protect from overflows."""
|
39 |
+
uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
|
40 |
+
return -torch.log(-torch.log(uniform_samples))
|
41 |
+
|
42 |
+
|
43 |
+
def rand_gumbel_like(x):
|
44 |
+
g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
|
45 |
+
return g
|
46 |
+
|
47 |
+
|
48 |
+
def slice_segments(x, ids_str, segment_size=4):
|
49 |
+
ret = torch.zeros_like(x[:, :, :segment_size])
|
50 |
+
for i in range(x.size(0)):
|
51 |
+
idx_str = ids_str[i]
|
52 |
+
idx_end = idx_str + segment_size
|
53 |
+
ret[i] = x[i, :, idx_str:idx_end]
|
54 |
+
return ret
|
55 |
+
|
56 |
+
|
57 |
+
def rand_slice_segments(x, x_lengths=None, segment_size=4):
|
58 |
+
b, d, t = x.size()
|
59 |
+
if x_lengths is None:
|
60 |
+
x_lengths = t
|
61 |
+
ids_str_max = x_lengths - segment_size + 1
|
62 |
+
ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
|
63 |
+
ret = slice_segments(x, ids_str, segment_size)
|
64 |
+
return ret, ids_str
|
65 |
+
|
66 |
+
|
67 |
+
def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
|
68 |
+
position = torch.arange(length, dtype=torch.float)
|
69 |
+
num_timescales = channels // 2
|
70 |
+
log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
|
71 |
+
num_timescales - 1
|
72 |
+
)
|
73 |
+
inv_timescales = min_timescale * torch.exp(
|
74 |
+
torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
|
75 |
+
)
|
76 |
+
scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
|
77 |
+
signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
|
78 |
+
signal = F.pad(signal, [0, 0, 0, channels % 2])
|
79 |
+
signal = signal.view(1, channels, length)
|
80 |
+
return signal
|
81 |
+
|
82 |
+
|
83 |
+
def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
|
84 |
+
b, channels, length = x.size()
|
85 |
+
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
|
86 |
+
return x + signal.to(dtype=x.dtype, device=x.device)
|
87 |
+
|
88 |
+
|
89 |
+
def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
|
90 |
+
b, channels, length = x.size()
|
91 |
+
signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
|
92 |
+
return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
|
93 |
+
|
94 |
+
|
95 |
+
def subsequent_mask(length):
|
96 |
+
mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
|
97 |
+
return mask
|
98 |
+
|
99 |
+
|
100 |
+
@torch.jit.script
|
101 |
+
def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
|
102 |
+
n_channels_int = n_channels[0]
|
103 |
+
in_act = input_a + input_b
|
104 |
+
t_act = torch.tanh(in_act[:, :n_channels_int, :])
|
105 |
+
s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
|
106 |
+
acts = t_act * s_act
|
107 |
+
return acts
|
108 |
+
|
109 |
+
|
110 |
+
def convert_pad_shape(pad_shape):
|
111 |
+
layer = pad_shape[::-1]
|
112 |
+
pad_shape = [item for sublist in layer for item in sublist]
|
113 |
+
return pad_shape
|
114 |
+
|
115 |
+
|
116 |
+
def shift_1d(x):
|
117 |
+
x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
|
118 |
+
return x
|
119 |
+
|
120 |
+
|
121 |
+
def sequence_mask(length, max_length=None):
|
122 |
+
if max_length is None:
|
123 |
+
max_length = length.max()
|
124 |
+
x = torch.arange(max_length, dtype=length.dtype, device=length.device)
|
125 |
+
return x.unsqueeze(0) < length.unsqueeze(1)
|
126 |
+
|
127 |
+
|
128 |
+
def generate_path(duration, mask):
|
129 |
+
"""
|
130 |
+
duration: [b, 1, t_x]
|
131 |
+
mask: [b, 1, t_y, t_x]
|
132 |
+
"""
|
133 |
+
|
134 |
+
b, _, t_y, t_x = mask.shape
|
135 |
+
cum_duration = torch.cumsum(duration, -1)
|
136 |
+
|
137 |
+
cum_duration_flat = cum_duration.view(b * t_x)
|
138 |
+
path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
|
139 |
+
path = path.view(b, t_x, t_y)
|
140 |
+
path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
|
141 |
+
path = path.unsqueeze(1).transpose(2, 3) * mask
|
142 |
+
return path
|
143 |
+
|
144 |
+
|
145 |
+
def clip_grad_value_(parameters, clip_value, norm_type=2):
|
146 |
+
if isinstance(parameters, torch.Tensor):
|
147 |
+
parameters = [parameters]
|
148 |
+
parameters = list(filter(lambda p: p.grad is not None, parameters))
|
149 |
+
norm_type = float(norm_type)
|
150 |
+
if clip_value is not None:
|
151 |
+
clip_value = float(clip_value)
|
152 |
+
|
153 |
+
total_norm = 0
|
154 |
+
for p in parameters:
|
155 |
+
param_norm = p.grad.data.norm(norm_type)
|
156 |
+
total_norm += param_norm.item() ** norm_type
|
157 |
+
if clip_value is not None:
|
158 |
+
p.grad.data.clamp_(min=-clip_value, max=clip_value)
|
159 |
+
total_norm = total_norm ** (1.0 / norm_type)
|
160 |
+
return total_norm
|
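A tiny illustration of the masking helper above, which most modules in this package rely on for variable-length batches.

```python
import torch
from openvoice import commons

lengths = torch.LongTensor([3, 5])
mask = commons.sequence_mask(lengths, max_length=6)
# tensor([[ True,  True,  True, False, False, False],
#         [ True,  True,  True,  True,  True, False]])

x = torch.randn(2, 4, 6)                        # [batch, channels, time]
x = x * mask.unsqueeze(1).to(x.dtype)           # zero out padded frames
```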
dreamvoice/train_utils/src/openvoice/mel_processing.py
ADDED
@@ -0,0 +1,183 @@
1 |
+
import torch
|
2 |
+
import torch.utils.data
|
3 |
+
import librosa  # spectrogram_torch_conv uses librosa.util.pad_center
from librosa.filters import mel as librosa_mel_fn
|
4 |
+
|
5 |
+
MAX_WAV_VALUE = 32768.0
|
6 |
+
|
7 |
+
|
8 |
+
def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
|
9 |
+
"""
|
10 |
+
PARAMS
|
11 |
+
------
|
12 |
+
C: compression factor
|
13 |
+
"""
|
14 |
+
return torch.log(torch.clamp(x, min=clip_val) * C)
|
15 |
+
|
16 |
+
|
17 |
+
def dynamic_range_decompression_torch(x, C=1):
|
18 |
+
"""
|
19 |
+
PARAMS
|
20 |
+
------
|
21 |
+
C: compression factor used to compress
|
22 |
+
"""
|
23 |
+
return torch.exp(x) / C
|
24 |
+
|
25 |
+
|
26 |
+
def spectral_normalize_torch(magnitudes):
|
27 |
+
output = dynamic_range_compression_torch(magnitudes)
|
28 |
+
return output
|
29 |
+
|
30 |
+
|
31 |
+
def spectral_de_normalize_torch(magnitudes):
|
32 |
+
output = dynamic_range_decompression_torch(magnitudes)
|
33 |
+
return output
|
34 |
+
|
35 |
+
|
36 |
+
mel_basis = {}
|
37 |
+
hann_window = {}
|
38 |
+
|
39 |
+
|
40 |
+
def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
|
41 |
+
if torch.min(y) < -1.1:
|
42 |
+
print("min value is ", torch.min(y))
|
43 |
+
if torch.max(y) > 1.1:
|
44 |
+
print("max value is ", torch.max(y))
|
45 |
+
|
46 |
+
global hann_window
|
47 |
+
dtype_device = str(y.dtype) + "_" + str(y.device)
|
48 |
+
wnsize_dtype_device = str(win_size) + "_" + dtype_device
|
49 |
+
if wnsize_dtype_device not in hann_window:
|
50 |
+
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
|
51 |
+
dtype=y.dtype, device=y.device
|
52 |
+
)
|
53 |
+
|
54 |
+
y = torch.nn.functional.pad(
|
55 |
+
y.unsqueeze(1),
|
56 |
+
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
|
57 |
+
mode="reflect",
|
58 |
+
)
|
59 |
+
y = y.squeeze(1)
|
60 |
+
|
61 |
+
spec = torch.stft(
|
62 |
+
y,
|
63 |
+
n_fft,
|
64 |
+
hop_length=hop_size,
|
65 |
+
win_length=win_size,
|
66 |
+
window=hann_window[wnsize_dtype_device],
|
67 |
+
center=center,
|
68 |
+
pad_mode="reflect",
|
69 |
+
normalized=False,
|
70 |
+
onesided=True,
|
71 |
+
return_complex=False,
|
72 |
+
)
|
73 |
+
|
74 |
+
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
|
75 |
+
return spec
|
76 |
+
|
77 |
+
|
78 |
+
def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
|
79 |
+
# if torch.min(y) < -1.:
|
80 |
+
# print('min value is ', torch.min(y))
|
81 |
+
# if torch.max(y) > 1.:
|
82 |
+
# print('max value is ', torch.max(y))
|
83 |
+
|
84 |
+
global hann_window
|
85 |
+
dtype_device = str(y.dtype) + '_' + str(y.device)
|
86 |
+
wnsize_dtype_device = str(win_size) + '_' + dtype_device
|
87 |
+
if wnsize_dtype_device not in hann_window:
|
88 |
+
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
|
89 |
+
|
90 |
+
y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
|
91 |
+
|
92 |
+
# ******************** original ************************#
|
93 |
+
# y = y.squeeze(1)
|
94 |
+
# spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
|
95 |
+
# center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
|
96 |
+
|
97 |
+
# ******************** ConvSTFT ************************#
|
98 |
+
freq_cutoff = n_fft // 2 + 1
|
99 |
+
fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
|
100 |
+
forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
|
101 |
+
forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()
|
102 |
+
|
103 |
+
import torch.nn.functional as F
|
104 |
+
|
105 |
+
# if center:
|
106 |
+
# signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1)
|
107 |
+
assert center is False
|
108 |
+
|
109 |
+
forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size)
|
110 |
+
spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1)
|
111 |
+
|
112 |
+
|
113 |
+
# ******************** Verification ************************#
|
114 |
+
spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
|
115 |
+
center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
|
116 |
+
assert torch.allclose(spec1, spec2, atol=1e-4)
|
117 |
+
|
118 |
+
spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
|
119 |
+
return spec
|
120 |
+
|
121 |
+
|
122 |
+
def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
|
123 |
+
global mel_basis
|
124 |
+
dtype_device = str(spec.dtype) + "_" + str(spec.device)
|
125 |
+
fmax_dtype_device = str(fmax) + "_" + dtype_device
|
126 |
+
if fmax_dtype_device not in mel_basis:
|
127 |
+
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
|
128 |
+
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
|
129 |
+
dtype=spec.dtype, device=spec.device
|
130 |
+
)
|
131 |
+
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
|
132 |
+
spec = spectral_normalize_torch(spec)
|
133 |
+
return spec
|
134 |
+
|
135 |
+
|
136 |
+
def mel_spectrogram_torch(
|
137 |
+
y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
|
138 |
+
):
|
139 |
+
if torch.min(y) < -1.0:
|
140 |
+
print("min value is ", torch.min(y))
|
141 |
+
if torch.max(y) > 1.0:
|
142 |
+
print("max value is ", torch.max(y))
|
143 |
+
|
144 |
+
global mel_basis, hann_window
|
145 |
+
dtype_device = str(y.dtype) + "_" + str(y.device)
|
146 |
+
fmax_dtype_device = str(fmax) + "_" + dtype_device
|
147 |
+
wnsize_dtype_device = str(win_size) + "_" + dtype_device
|
148 |
+
if fmax_dtype_device not in mel_basis:
|
149 |
+
mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
|
150 |
+
mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
|
151 |
+
dtype=y.dtype, device=y.device
|
152 |
+
)
|
153 |
+
if wnsize_dtype_device not in hann_window:
|
154 |
+
hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
|
155 |
+
dtype=y.dtype, device=y.device
|
156 |
+
)
|
157 |
+
|
158 |
+
y = torch.nn.functional.pad(
|
159 |
+
y.unsqueeze(1),
|
160 |
+
(int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
|
161 |
+
mode="reflect",
|
162 |
+
)
|
163 |
+
y = y.squeeze(1)
|
164 |
+
|
165 |
+
spec = torch.stft(
|
166 |
+
y,
|
167 |
+
n_fft,
|
168 |
+
hop_length=hop_size,
|
169 |
+
win_length=win_size,
|
170 |
+
window=hann_window[wnsize_dtype_device],
|
171 |
+
center=center,
|
172 |
+
pad_mode="reflect",
|
173 |
+
normalized=False,
|
174 |
+
onesided=True,
|
175 |
+
return_complex=False,
|
176 |
+
)
|
177 |
+
|
178 |
+
spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
|
179 |
+
|
180 |
+
spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
|
181 |
+
spec = spectral_normalize_torch(spec)
|
182 |
+
|
183 |
+
return spec
|
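As a sanity-check sketch of the STFT front end above, a short synthetic tone is turned into a linear magnitude spectrogram. Note that spec_to_mel_torch and mel_spectrogram_torch pass positional arguments to librosa.filters.mel, which newer librosa releases (>= 0.10) reject, so the mel path may need keyword arguments there.

```python
import math
import torch
from openvoice import mel_processing

sr = 22050
t = torch.linspace(0, 1, sr)
y = 0.2 * torch.sin(2 * math.pi * 440.0 * t).unsqueeze(0)     # [1, samples], a quiet 440 Hz tone

spec = mel_processing.spectrogram_torch(y, n_fft=1024, sampling_rate=sr,
                                        hop_size=256, win_size=1024, center=False)
print(spec.shape)                                             # [1, 513, n_frames]
```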
dreamvoice/train_utils/src/openvoice/models.py
ADDED
@@ -0,0 +1,499 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
from openvoice import commons
|
7 |
+
from openvoice import modules
|
8 |
+
from openvoice import attentions
|
9 |
+
|
10 |
+
from torch.nn import Conv1d, ConvTranspose1d, Conv2d
|
11 |
+
from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
|
12 |
+
|
13 |
+
from openvoice.commons import init_weights, get_padding
|
14 |
+
|
15 |
+
|
16 |
+
class TextEncoder(nn.Module):
|
17 |
+
def __init__(self,
|
18 |
+
n_vocab,
|
19 |
+
out_channels,
|
20 |
+
hidden_channels,
|
21 |
+
filter_channels,
|
22 |
+
n_heads,
|
23 |
+
n_layers,
|
24 |
+
kernel_size,
|
25 |
+
p_dropout):
|
26 |
+
super().__init__()
|
27 |
+
self.n_vocab = n_vocab
|
28 |
+
self.out_channels = out_channels
|
29 |
+
self.hidden_channels = hidden_channels
|
30 |
+
self.filter_channels = filter_channels
|
31 |
+
self.n_heads = n_heads
|
32 |
+
self.n_layers = n_layers
|
33 |
+
self.kernel_size = kernel_size
|
34 |
+
self.p_dropout = p_dropout
|
35 |
+
|
36 |
+
self.emb = nn.Embedding(n_vocab, hidden_channels)
|
37 |
+
nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
|
38 |
+
|
39 |
+
self.encoder = attentions.Encoder(
|
40 |
+
hidden_channels,
|
41 |
+
filter_channels,
|
42 |
+
n_heads,
|
43 |
+
n_layers,
|
44 |
+
kernel_size,
|
45 |
+
p_dropout)
|
46 |
+
self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
47 |
+
|
48 |
+
def forward(self, x, x_lengths):
|
49 |
+
x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
|
50 |
+
x = torch.transpose(x, 1, -1) # [b, h, t]
|
51 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
|
52 |
+
|
53 |
+
x = self.encoder(x * x_mask, x_mask)
|
54 |
+
stats = self.proj(x) * x_mask
|
55 |
+
|
56 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
57 |
+
return x, m, logs, x_mask
|
58 |
+
|
59 |
+
|
60 |
+
class DurationPredictor(nn.Module):
|
61 |
+
def __init__(
|
62 |
+
self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
|
63 |
+
):
|
64 |
+
super().__init__()
|
65 |
+
|
66 |
+
self.in_channels = in_channels
|
67 |
+
self.filter_channels = filter_channels
|
68 |
+
self.kernel_size = kernel_size
|
69 |
+
self.p_dropout = p_dropout
|
70 |
+
self.gin_channels = gin_channels
|
71 |
+
|
72 |
+
self.drop = nn.Dropout(p_dropout)
|
73 |
+
self.conv_1 = nn.Conv1d(
|
74 |
+
in_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
75 |
+
)
|
76 |
+
self.norm_1 = modules.LayerNorm(filter_channels)
|
77 |
+
self.conv_2 = nn.Conv1d(
|
78 |
+
filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
|
79 |
+
)
|
80 |
+
self.norm_2 = modules.LayerNorm(filter_channels)
|
81 |
+
self.proj = nn.Conv1d(filter_channels, 1, 1)
|
82 |
+
|
83 |
+
if gin_channels != 0:
|
84 |
+
self.cond = nn.Conv1d(gin_channels, in_channels, 1)
|
85 |
+
|
86 |
+
def forward(self, x, x_mask, g=None):
|
87 |
+
x = torch.detach(x)
|
88 |
+
if g is not None:
|
89 |
+
g = torch.detach(g)
|
90 |
+
x = x + self.cond(g)
|
91 |
+
x = self.conv_1(x * x_mask)
|
92 |
+
x = torch.relu(x)
|
93 |
+
x = self.norm_1(x)
|
94 |
+
x = self.drop(x)
|
95 |
+
x = self.conv_2(x * x_mask)
|
96 |
+
x = torch.relu(x)
|
97 |
+
x = self.norm_2(x)
|
98 |
+
x = self.drop(x)
|
99 |
+
x = self.proj(x * x_mask)
|
100 |
+
return x * x_mask
|
101 |
+
|
102 |
+
class StochasticDurationPredictor(nn.Module):
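# Flow-based duration predictor: in training it returns the negative log-likelihood of the
# observed durations w; with reverse=True it samples log-durations from noise (used in infer).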
|
103 |
+
def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
|
104 |
+
super().__init__()
|
105 |
+
filter_channels = in_channels  # note: this override should be removed in a future version.
|
106 |
+
self.in_channels = in_channels
|
107 |
+
self.filter_channels = filter_channels
|
108 |
+
self.kernel_size = kernel_size
|
109 |
+
self.p_dropout = p_dropout
|
110 |
+
self.n_flows = n_flows
|
111 |
+
self.gin_channels = gin_channels
|
112 |
+
|
113 |
+
self.log_flow = modules.Log()
|
114 |
+
self.flows = nn.ModuleList()
|
115 |
+
self.flows.append(modules.ElementwiseAffine(2))
|
116 |
+
for i in range(n_flows):
|
117 |
+
self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
|
118 |
+
self.flows.append(modules.Flip())
|
119 |
+
|
120 |
+
self.post_pre = nn.Conv1d(1, filter_channels, 1)
|
121 |
+
self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
122 |
+
self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
|
123 |
+
self.post_flows = nn.ModuleList()
|
124 |
+
self.post_flows.append(modules.ElementwiseAffine(2))
|
125 |
+
for i in range(4):
|
126 |
+
self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
|
127 |
+
self.post_flows.append(modules.Flip())
|
128 |
+
|
129 |
+
self.pre = nn.Conv1d(in_channels, filter_channels, 1)
|
130 |
+
self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
|
131 |
+
self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
|
132 |
+
if gin_channels != 0:
|
133 |
+
self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
|
134 |
+
|
135 |
+
def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
|
136 |
+
x = torch.detach(x)
|
137 |
+
x = self.pre(x)
|
138 |
+
if g is not None:
|
139 |
+
g = torch.detach(g)
|
140 |
+
x = x + self.cond(g)
|
141 |
+
x = self.convs(x, x_mask)
|
142 |
+
x = self.proj(x) * x_mask
|
143 |
+
|
144 |
+
if not reverse:
|
145 |
+
flows = self.flows
|
146 |
+
assert w is not None
|
147 |
+
|
148 |
+
logdet_tot_q = 0
|
149 |
+
h_w = self.post_pre(w)
|
150 |
+
h_w = self.post_convs(h_w, x_mask)
|
151 |
+
h_w = self.post_proj(h_w) * x_mask
|
152 |
+
e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
|
153 |
+
z_q = e_q
|
154 |
+
for flow in self.post_flows:
|
155 |
+
z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
|
156 |
+
logdet_tot_q += logdet_q
|
157 |
+
z_u, z1 = torch.split(z_q, [1, 1], 1)
|
158 |
+
u = torch.sigmoid(z_u) * x_mask
|
159 |
+
z0 = (w - u) * x_mask
|
160 |
+
logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
|
161 |
+
logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
|
162 |
+
|
163 |
+
logdet_tot = 0
|
164 |
+
z0, logdet = self.log_flow(z0, x_mask)
|
165 |
+
logdet_tot += logdet
|
166 |
+
z = torch.cat([z0, z1], 1)
|
167 |
+
for flow in flows:
|
168 |
+
z, logdet = flow(z, x_mask, g=x, reverse=reverse)
|
169 |
+
logdet_tot = logdet_tot + logdet
|
170 |
+
nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
|
171 |
+
return nll + logq # [b]
|
172 |
+
else:
|
173 |
+
flows = list(reversed(self.flows))
|
174 |
+
flows = flows[:-2] + [flows[-1]] # remove a useless vflow
|
175 |
+
z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
|
176 |
+
for flow in flows:
|
177 |
+
z = flow(z, x_mask, g=x, reverse=reverse)
|
178 |
+
z0, z1 = torch.split(z, [1, 1], 1)
|
179 |
+
logw = z0
|
180 |
+
return logw
|
181 |
+
|
182 |
+
class PosteriorEncoder(nn.Module):
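# Posterior encoder: non-causal WaveNet (WN) over the input spectrogram, conditioned on the
# speaker embedding g; samples latents z = m + eps * tau * exp(logs) under the frame mask.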
|
183 |
+
def __init__(
|
184 |
+
self,
|
185 |
+
in_channels,
|
186 |
+
out_channels,
|
187 |
+
hidden_channels,
|
188 |
+
kernel_size,
|
189 |
+
dilation_rate,
|
190 |
+
n_layers,
|
191 |
+
gin_channels=0,
|
192 |
+
):
|
193 |
+
super().__init__()
|
194 |
+
self.in_channels = in_channels
|
195 |
+
self.out_channels = out_channels
|
196 |
+
self.hidden_channels = hidden_channels
|
197 |
+
self.kernel_size = kernel_size
|
198 |
+
self.dilation_rate = dilation_rate
|
199 |
+
self.n_layers = n_layers
|
200 |
+
self.gin_channels = gin_channels
|
201 |
+
|
202 |
+
self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
|
203 |
+
self.enc = modules.WN(
|
204 |
+
hidden_channels,
|
205 |
+
kernel_size,
|
206 |
+
dilation_rate,
|
207 |
+
n_layers,
|
208 |
+
gin_channels=gin_channels,
|
209 |
+
)
|
210 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
|
211 |
+
|
212 |
+
def forward(self, x, x_lengths, g=None, tau=1.0):
|
213 |
+
x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
|
214 |
+
x.dtype
|
215 |
+
)
|
216 |
+
x = self.pre(x) * x_mask
|
217 |
+
x = self.enc(x, x_mask, g=g)
|
218 |
+
stats = self.proj(x) * x_mask
|
219 |
+
m, logs = torch.split(stats, self.out_channels, dim=1)
|
220 |
+
z = (m + torch.randn_like(m) * tau * torch.exp(logs)) * x_mask
|
221 |
+
return z, m, logs, x_mask
|
222 |
+
|
223 |
+
|
224 |
+
class Generator(torch.nn.Module):
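# HiFi-GAN style decoder: transposed-convolution upsampling interleaved with multi-kernel
# residual blocks; an optional speaker embedding is added after conv_pre.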
|
225 |
+
def __init__(
|
226 |
+
self,
|
227 |
+
initial_channel,
|
228 |
+
resblock,
|
229 |
+
resblock_kernel_sizes,
|
230 |
+
resblock_dilation_sizes,
|
231 |
+
upsample_rates,
|
232 |
+
upsample_initial_channel,
|
233 |
+
upsample_kernel_sizes,
|
234 |
+
gin_channels=0,
|
235 |
+
):
|
236 |
+
super(Generator, self).__init__()
|
237 |
+
self.num_kernels = len(resblock_kernel_sizes)
|
238 |
+
self.num_upsamples = len(upsample_rates)
|
239 |
+
self.conv_pre = Conv1d(
|
240 |
+
initial_channel, upsample_initial_channel, 7, 1, padding=3
|
241 |
+
)
|
242 |
+
resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
|
243 |
+
|
244 |
+
self.ups = nn.ModuleList()
|
245 |
+
for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
|
246 |
+
self.ups.append(
|
247 |
+
weight_norm(
|
248 |
+
ConvTranspose1d(
|
249 |
+
upsample_initial_channel // (2**i),
|
250 |
+
upsample_initial_channel // (2 ** (i + 1)),
|
251 |
+
k,
|
252 |
+
u,
|
253 |
+
padding=(k - u) // 2,
|
254 |
+
)
|
255 |
+
)
|
256 |
+
)
|
257 |
+
|
258 |
+
self.resblocks = nn.ModuleList()
|
259 |
+
for i in range(len(self.ups)):
|
260 |
+
ch = upsample_initial_channel // (2 ** (i + 1))
|
261 |
+
for j, (k, d) in enumerate(
|
262 |
+
zip(resblock_kernel_sizes, resblock_dilation_sizes)
|
263 |
+
):
|
264 |
+
self.resblocks.append(resblock(ch, k, d))
|
265 |
+
|
266 |
+
self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
|
267 |
+
self.ups.apply(init_weights)
|
268 |
+
|
269 |
+
if gin_channels != 0:
|
270 |
+
self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
|
271 |
+
|
272 |
+
def forward(self, x, g=None):
|
273 |
+
x = self.conv_pre(x)
|
274 |
+
if g is not None:
|
275 |
+
x = x + self.cond(g)
|
276 |
+
|
277 |
+
for i in range(self.num_upsamples):
|
278 |
+
x = F.leaky_relu(x, modules.LRELU_SLOPE)
|
279 |
+
x = self.ups[i](x)
|
280 |
+
xs = None
|
281 |
+
for j in range(self.num_kernels):
|
282 |
+
if xs is None:
|
283 |
+
xs = self.resblocks[i * self.num_kernels + j](x)
|
284 |
+
else:
|
285 |
+
xs += self.resblocks[i * self.num_kernels + j](x)
|
286 |
+
x = xs / self.num_kernels
|
287 |
+
x = F.leaky_relu(x)
|
288 |
+
x = self.conv_post(x)
|
289 |
+
x = torch.tanh(x)
|
290 |
+
|
291 |
+
return x
|
292 |
+
|
293 |
+
def remove_weight_norm(self):
|
294 |
+
print("Removing weight norm...")
|
295 |
+
for layer in self.ups:
|
296 |
+
remove_weight_norm(layer)
|
297 |
+
for layer in self.resblocks:
|
298 |
+
layer.remove_weight_norm()
|
299 |
+
|
300 |
+
|
301 |
+
class ReferenceEncoder(nn.Module):
|
302 |
+
"""
|
303 |
+
inputs --- [N, Ty/r, n_mels*r] mels
|
304 |
+
outputs --- [N, ref_enc_gru_size]
|
305 |
+
"""
|
306 |
+
|
307 |
+
def __init__(self, spec_channels, gin_channels=0, layernorm=True):
|
308 |
+
super().__init__()
|
309 |
+
self.spec_channels = spec_channels
|
310 |
+
ref_enc_filters = [32, 32, 64, 64, 128, 128]
|
311 |
+
K = len(ref_enc_filters)
|
312 |
+
filters = [1] + ref_enc_filters
|
313 |
+
convs = [
|
314 |
+
weight_norm(
|
315 |
+
nn.Conv2d(
|
316 |
+
in_channels=filters[i],
|
317 |
+
out_channels=filters[i + 1],
|
318 |
+
kernel_size=(3, 3),
|
319 |
+
stride=(2, 2),
|
320 |
+
padding=(1, 1),
|
321 |
+
)
|
322 |
+
)
|
323 |
+
for i in range(K)
|
324 |
+
]
|
325 |
+
self.convs = nn.ModuleList(convs)
|
326 |
+
|
327 |
+
out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
|
328 |
+
self.gru = nn.GRU(
|
329 |
+
input_size=ref_enc_filters[-1] * out_channels,
|
330 |
+
hidden_size=256 // 2,
|
331 |
+
batch_first=True,
|
332 |
+
)
|
333 |
+
self.proj = nn.Linear(128, gin_channels)
|
334 |
+
if layernorm:
|
335 |
+
self.layernorm = nn.LayerNorm(self.spec_channels)
|
336 |
+
else:
|
337 |
+
self.layernorm = None
|
338 |
+
|
339 |
+
def forward(self, inputs, mask=None):
|
340 |
+
N = inputs.size(0)
|
341 |
+
|
342 |
+
out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
|
343 |
+
if self.layernorm is not None:
|
344 |
+
out = self.layernorm(out)
|
345 |
+
|
346 |
+
for conv in self.convs:
|
347 |
+
out = conv(out)
|
348 |
+
# out = wn(out)
|
349 |
+
out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
|
350 |
+
|
351 |
+
out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
|
352 |
+
T = out.size(1)
|
353 |
+
N = out.size(0)
|
354 |
+
out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
|
355 |
+
|
356 |
+
self.gru.flatten_parameters()
|
357 |
+
memory, out = self.gru(out) # out --- [1, N, 128]
|
358 |
+
|
359 |
+
return self.proj(out.squeeze(0))
|
360 |
+
|
361 |
+
def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
|
362 |
+
for i in range(n_convs):
|
363 |
+
L = (L - kernel_size + 2 * pad) // stride + 1
|
364 |
+
return L
|
365 |
+
|
366 |
+
|
367 |
+
class ResidualCouplingBlock(nn.Module):
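# Stack of n_flows mean-only affine coupling layers alternated with channel flips;
# call with reverse=True to invert the flow.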
|
368 |
+
def __init__(self,
|
369 |
+
channels,
|
370 |
+
hidden_channels,
|
371 |
+
kernel_size,
|
372 |
+
dilation_rate,
|
373 |
+
n_layers,
|
374 |
+
n_flows=4,
|
375 |
+
gin_channels=0):
|
376 |
+
super().__init__()
|
377 |
+
self.channels = channels
|
378 |
+
self.hidden_channels = hidden_channels
|
379 |
+
self.kernel_size = kernel_size
|
380 |
+
self.dilation_rate = dilation_rate
|
381 |
+
self.n_layers = n_layers
|
382 |
+
self.n_flows = n_flows
|
383 |
+
self.gin_channels = gin_channels
|
384 |
+
|
385 |
+
self.flows = nn.ModuleList()
|
386 |
+
for i in range(n_flows):
|
387 |
+
self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
|
388 |
+
self.flows.append(modules.Flip())
|
389 |
+
|
390 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
391 |
+
if not reverse:
|
392 |
+
for flow in self.flows:
|
393 |
+
x, _ = flow(x, x_mask, g=g, reverse=reverse)
|
394 |
+
else:
|
395 |
+
for flow in reversed(self.flows):
|
396 |
+
x = flow(x, x_mask, g=g, reverse=reverse)
|
397 |
+
return x
|
398 |
+
|
399 |
+
class SynthesizerTrn(nn.Module):
|
400 |
+
"""
|
401 |
+
Synthesizer for Training
|
402 |
+
"""
|
403 |
+
|
404 |
+
def __init__(
|
405 |
+
self,
|
406 |
+
n_vocab,
|
407 |
+
spec_channels,
|
408 |
+
inter_channels,
|
409 |
+
hidden_channels,
|
410 |
+
filter_channels,
|
411 |
+
n_heads,
|
412 |
+
n_layers,
|
413 |
+
kernel_size,
|
414 |
+
p_dropout,
|
415 |
+
resblock,
|
416 |
+
resblock_kernel_sizes,
|
417 |
+
resblock_dilation_sizes,
|
418 |
+
upsample_rates,
|
419 |
+
upsample_initial_channel,
|
420 |
+
upsample_kernel_sizes,
|
421 |
+
n_speakers=256,
|
422 |
+
gin_channels=256,
|
423 |
+
zero_g=False,
|
424 |
+
**kwargs
|
425 |
+
):
|
426 |
+
super().__init__()
|
427 |
+
|
428 |
+
self.dec = Generator(
|
429 |
+
inter_channels,
|
430 |
+
resblock,
|
431 |
+
resblock_kernel_sizes,
|
432 |
+
resblock_dilation_sizes,
|
433 |
+
upsample_rates,
|
434 |
+
upsample_initial_channel,
|
435 |
+
upsample_kernel_sizes,
|
436 |
+
gin_channels=gin_channels,
|
437 |
+
)
|
438 |
+
self.enc_q = PosteriorEncoder(
|
439 |
+
spec_channels,
|
440 |
+
inter_channels,
|
441 |
+
hidden_channels,
|
442 |
+
5,
|
443 |
+
1,
|
444 |
+
16,
|
445 |
+
gin_channels=gin_channels,
|
446 |
+
)
|
447 |
+
|
448 |
+
self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
|
449 |
+
|
450 |
+
self.n_speakers = n_speakers
|
451 |
+
if n_speakers == 0:
|
452 |
+
self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
|
453 |
+
else:
|
454 |
+
self.enc_p = TextEncoder(n_vocab,
|
455 |
+
inter_channels,
|
456 |
+
hidden_channels,
|
457 |
+
filter_channels,
|
458 |
+
n_heads,
|
459 |
+
n_layers,
|
460 |
+
kernel_size,
|
461 |
+
p_dropout)
|
462 |
+
self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
|
463 |
+
self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
|
464 |
+
self.emb_g = nn.Embedding(n_speakers, gin_channels)
|
465 |
+
self.zero_g = zero_g
|
466 |
+
|
467 |
+
def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
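# Text-to-speech inference: encode text, mix stochastic and deterministic duration
# predictions via sdp_ratio, expand the prior along the predicted alignment path,
# then invert the flow and decode to waveform.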
|
468 |
+
x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
|
469 |
+
if self.n_speakers > 0:
|
470 |
+
g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
|
471 |
+
else:
|
472 |
+
g = None
|
473 |
+
|
474 |
+
logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio \
|
475 |
+
+ self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
|
476 |
+
|
477 |
+
w = torch.exp(logw) * x_mask * length_scale
|
478 |
+
w_ceil = torch.ceil(w)
|
479 |
+
y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
|
480 |
+
y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
|
481 |
+
attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
|
482 |
+
attn = commons.generate_path(w_ceil, attn_mask)
|
483 |
+
|
484 |
+
m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
|
485 |
+
logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
|
486 |
+
|
487 |
+
z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
|
488 |
+
z = self.flow(z_p, y_mask, g=g, reverse=True)
|
489 |
+
o = self.dec((z * y_mask)[:,:,:max_len], g=g)
|
490 |
+
return o, attn, y_mask, (z, z_p, m_p, logs_p)
|
491 |
+
|
492 |
+
def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
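# Tone-color conversion: encode the source spectrogram with the source embedding (g_src),
# map to the speaker-independent space via the forward flow, invert the flow with the
# target embedding (g_tgt), and decode the result with the generator.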
|
493 |
+
g_src = sid_src
|
494 |
+
g_tgt = sid_tgt
|
495 |
+
z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
|
496 |
+
z_p = self.flow(z, y_mask, g=g_src)
|
497 |
+
z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
|
498 |
+
o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
|
499 |
+
return o_hat, y_mask, (z, z_p, z_hat)
|
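A minimal, hypothetical usage sketch for the conversion path above (in the repository these steps are wrapped by ToneColorConverter in api.py); the configuration values, spectrogram shape, and random embeddings below are illustrative assumptions, not the project's actual settings:

# sketch: drive SynthesizerTrn.voice_conversion directly with precomputed tone-color embeddings
import torch

model = SynthesizerTrn(
    n_vocab=0, spec_channels=513, inter_channels=192, hidden_channels=192,
    filter_channels=768, n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1,
    resblock="1", resblock_kernel_sizes=[3, 7, 11],
    resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
    upsample_rates=[8, 8, 2, 2], upsample_initial_channel=512,
    upsample_kernel_sizes=[16, 16, 4, 4],
    n_speakers=0, gin_channels=256,   # n_speakers=0 -> reference-encoder / conversion-only setup
).eval()

spec = torch.randn(1, 513, 400)                  # [B, spec_channels, T] linear spectrogram of the source audio
spec_lengths = torch.LongTensor([spec.size(2)])
g_src = torch.randn(1, 256, 1)                   # source tone-color embedding, [B, gin_channels, 1]
g_tgt = torch.randn(1, 256, 1)                   # target tone-color embedding

with torch.no_grad():
    audio, y_mask, _ = model.voice_conversion(spec, spec_lengths,
                                              sid_src=g_src, sid_tgt=g_tgt, tau=0.3)
# audio: [B, 1, T * prod(upsample_rates)] waveform rendered in the target tone color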
dreamvoice/train_utils/src/openvoice/modules.py
ADDED
@@ -0,0 +1,598 @@
1 |
+
import math
|
2 |
+
import torch
|
3 |
+
from torch import nn
|
4 |
+
from torch.nn import functional as F
|
5 |
+
|
6 |
+
from torch.nn import Conv1d
|
7 |
+
from torch.nn.utils import weight_norm, remove_weight_norm
|
8 |
+
|
9 |
+
from openvoice import commons
|
10 |
+
from openvoice.commons import init_weights, get_padding
|
11 |
+
from openvoice.transforms import piecewise_rational_quadratic_transform
|
12 |
+
from openvoice.attentions import Encoder
|
13 |
+
|
14 |
+
LRELU_SLOPE = 0.1
|
15 |
+
|
16 |
+
|
17 |
+
class LayerNorm(nn.Module):
|
18 |
+
def __init__(self, channels, eps=1e-5):
|
19 |
+
super().__init__()
|
20 |
+
self.channels = channels
|
21 |
+
self.eps = eps
|
22 |
+
|
23 |
+
self.gamma = nn.Parameter(torch.ones(channels))
|
24 |
+
self.beta = nn.Parameter(torch.zeros(channels))
|
25 |
+
|
26 |
+
def forward(self, x):
|
27 |
+
x = x.transpose(1, -1)
|
28 |
+
x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
|
29 |
+
return x.transpose(1, -1)
|
30 |
+
|
31 |
+
|
32 |
+
class ConvReluNorm(nn.Module):
|
33 |
+
def __init__(
|
34 |
+
self,
|
35 |
+
in_channels,
|
36 |
+
hidden_channels,
|
37 |
+
out_channels,
|
38 |
+
kernel_size,
|
39 |
+
n_layers,
|
40 |
+
p_dropout,
|
41 |
+
):
|
42 |
+
super().__init__()
|
43 |
+
self.in_channels = in_channels
|
44 |
+
self.hidden_channels = hidden_channels
|
45 |
+
self.out_channels = out_channels
|
46 |
+
self.kernel_size = kernel_size
|
47 |
+
self.n_layers = n_layers
|
48 |
+
self.p_dropout = p_dropout
|
49 |
+
assert n_layers > 1, "Number of layers should be larger than 1."
|
50 |
+
|
51 |
+
self.conv_layers = nn.ModuleList()
|
52 |
+
self.norm_layers = nn.ModuleList()
|
53 |
+
self.conv_layers.append(
|
54 |
+
nn.Conv1d(
|
55 |
+
in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
|
56 |
+
)
|
57 |
+
)
|
58 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
59 |
+
self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
|
60 |
+
for _ in range(n_layers - 1):
|
61 |
+
self.conv_layers.append(
|
62 |
+
nn.Conv1d(
|
63 |
+
hidden_channels,
|
64 |
+
hidden_channels,
|
65 |
+
kernel_size,
|
66 |
+
padding=kernel_size // 2,
|
67 |
+
)
|
68 |
+
)
|
69 |
+
self.norm_layers.append(LayerNorm(hidden_channels))
|
70 |
+
self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
|
71 |
+
self.proj.weight.data.zero_()
|
72 |
+
self.proj.bias.data.zero_()
|
73 |
+
|
74 |
+
def forward(self, x, x_mask):
|
75 |
+
x_org = x
|
76 |
+
for i in range(self.n_layers):
|
77 |
+
x = self.conv_layers[i](x * x_mask)
|
78 |
+
x = self.norm_layers[i](x)
|
79 |
+
x = self.relu_drop(x)
|
80 |
+
x = x_org + self.proj(x)
|
81 |
+
return x * x_mask
|
82 |
+
|
83 |
+
|
84 |
+
class DDSConv(nn.Module):
|
85 |
+
"""
|
86 |
+
Dilated and Depth-Separable Convolution
|
87 |
+
"""
|
88 |
+
|
89 |
+
def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
|
90 |
+
super().__init__()
|
91 |
+
self.channels = channels
|
92 |
+
self.kernel_size = kernel_size
|
93 |
+
self.n_layers = n_layers
|
94 |
+
self.p_dropout = p_dropout
|
95 |
+
|
96 |
+
self.drop = nn.Dropout(p_dropout)
|
97 |
+
self.convs_sep = nn.ModuleList()
|
98 |
+
self.convs_1x1 = nn.ModuleList()
|
99 |
+
self.norms_1 = nn.ModuleList()
|
100 |
+
self.norms_2 = nn.ModuleList()
|
101 |
+
for i in range(n_layers):
|
102 |
+
dilation = kernel_size**i
|
103 |
+
padding = (kernel_size * dilation - dilation) // 2
|
104 |
+
self.convs_sep.append(
|
105 |
+
nn.Conv1d(
|
106 |
+
channels,
|
107 |
+
channels,
|
108 |
+
kernel_size,
|
109 |
+
groups=channels,
|
110 |
+
dilation=dilation,
|
111 |
+
padding=padding,
|
112 |
+
)
|
113 |
+
)
|
114 |
+
self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
|
115 |
+
self.norms_1.append(LayerNorm(channels))
|
116 |
+
self.norms_2.append(LayerNorm(channels))
|
117 |
+
|
118 |
+
def forward(self, x, x_mask, g=None):
|
119 |
+
if g is not None:
|
120 |
+
x = x + g
|
121 |
+
for i in range(self.n_layers):
|
122 |
+
y = self.convs_sep[i](x * x_mask)
|
123 |
+
y = self.norms_1[i](y)
|
124 |
+
y = F.gelu(y)
|
125 |
+
y = self.convs_1x1[i](y)
|
126 |
+
y = self.norms_2[i](y)
|
127 |
+
y = F.gelu(y)
|
128 |
+
y = self.drop(y)
|
129 |
+
x = x + y
|
130 |
+
return x * x_mask
|
131 |
+
|
132 |
+
|
133 |
+
class WN(torch.nn.Module):
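# Non-causal WaveNet block: dilated, weight-normalised convolutions with gated
# tanh/sigmoid activations and optional global conditioning through cond_layer.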
|
134 |
+
def __init__(
|
135 |
+
self,
|
136 |
+
hidden_channels,
|
137 |
+
kernel_size,
|
138 |
+
dilation_rate,
|
139 |
+
n_layers,
|
140 |
+
gin_channels=0,
|
141 |
+
p_dropout=0,
|
142 |
+
):
|
143 |
+
super(WN, self).__init__()
|
144 |
+
assert kernel_size % 2 == 1
|
145 |
+
self.hidden_channels = hidden_channels
|
146 |
+
self.kernel_size = (kernel_size,)
|
147 |
+
self.dilation_rate = dilation_rate
|
148 |
+
self.n_layers = n_layers
|
149 |
+
self.gin_channels = gin_channels
|
150 |
+
self.p_dropout = p_dropout
|
151 |
+
|
152 |
+
self.in_layers = torch.nn.ModuleList()
|
153 |
+
self.res_skip_layers = torch.nn.ModuleList()
|
154 |
+
self.drop = nn.Dropout(p_dropout)
|
155 |
+
|
156 |
+
if gin_channels != 0:
|
157 |
+
cond_layer = torch.nn.Conv1d(
|
158 |
+
gin_channels, 2 * hidden_channels * n_layers, 1
|
159 |
+
)
|
160 |
+
self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
|
161 |
+
|
162 |
+
for i in range(n_layers):
|
163 |
+
dilation = dilation_rate**i
|
164 |
+
padding = int((kernel_size * dilation - dilation) / 2)
|
165 |
+
in_layer = torch.nn.Conv1d(
|
166 |
+
hidden_channels,
|
167 |
+
2 * hidden_channels,
|
168 |
+
kernel_size,
|
169 |
+
dilation=dilation,
|
170 |
+
padding=padding,
|
171 |
+
)
|
172 |
+
in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
|
173 |
+
self.in_layers.append(in_layer)
|
174 |
+
|
175 |
+
# last one is not necessary
|
176 |
+
if i < n_layers - 1:
|
177 |
+
res_skip_channels = 2 * hidden_channels
|
178 |
+
else:
|
179 |
+
res_skip_channels = hidden_channels
|
180 |
+
|
181 |
+
res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
|
182 |
+
res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
|
183 |
+
self.res_skip_layers.append(res_skip_layer)
|
184 |
+
|
185 |
+
def forward(self, x, x_mask, g=None, **kwargs):
|
186 |
+
output = torch.zeros_like(x)
|
187 |
+
n_channels_tensor = torch.IntTensor([self.hidden_channels])
|
188 |
+
|
189 |
+
if g is not None:
|
190 |
+
g = self.cond_layer(g)
|
191 |
+
|
192 |
+
for i in range(self.n_layers):
|
193 |
+
x_in = self.in_layers[i](x)
|
194 |
+
if g is not None:
|
195 |
+
cond_offset = i * 2 * self.hidden_channels
|
196 |
+
g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
|
197 |
+
else:
|
198 |
+
g_l = torch.zeros_like(x_in)
|
199 |
+
|
200 |
+
acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
|
201 |
+
acts = self.drop(acts)
|
202 |
+
|
203 |
+
res_skip_acts = self.res_skip_layers[i](acts)
|
204 |
+
if i < self.n_layers - 1:
|
205 |
+
res_acts = res_skip_acts[:, : self.hidden_channels, :]
|
206 |
+
x = (x + res_acts) * x_mask
|
207 |
+
output = output + res_skip_acts[:, self.hidden_channels :, :]
|
208 |
+
else:
|
209 |
+
output = output + res_skip_acts
|
210 |
+
return output * x_mask
|
211 |
+
|
212 |
+
def remove_weight_norm(self):
|
213 |
+
if self.gin_channels != 0:
|
214 |
+
torch.nn.utils.remove_weight_norm(self.cond_layer)
|
215 |
+
for l in self.in_layers:
|
216 |
+
torch.nn.utils.remove_weight_norm(l)
|
217 |
+
for l in self.res_skip_layers:
|
218 |
+
torch.nn.utils.remove_weight_norm(l)
|
219 |
+
|
220 |
+
|
221 |
+
class ResBlock1(torch.nn.Module):
|
222 |
+
def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
|
223 |
+
super(ResBlock1, self).__init__()
|
224 |
+
self.convs1 = nn.ModuleList(
|
225 |
+
[
|
226 |
+
weight_norm(
|
227 |
+
Conv1d(
|
228 |
+
channels,
|
229 |
+
channels,
|
230 |
+
kernel_size,
|
231 |
+
1,
|
232 |
+
dilation=dilation[0],
|
233 |
+
padding=get_padding(kernel_size, dilation[0]),
|
234 |
+
)
|
235 |
+
),
|
236 |
+
weight_norm(
|
237 |
+
Conv1d(
|
238 |
+
channels,
|
239 |
+
channels,
|
240 |
+
kernel_size,
|
241 |
+
1,
|
242 |
+
dilation=dilation[1],
|
243 |
+
padding=get_padding(kernel_size, dilation[1]),
|
244 |
+
)
|
245 |
+
),
|
246 |
+
weight_norm(
|
247 |
+
Conv1d(
|
248 |
+
channels,
|
249 |
+
channels,
|
250 |
+
kernel_size,
|
251 |
+
1,
|
252 |
+
dilation=dilation[2],
|
253 |
+
padding=get_padding(kernel_size, dilation[2]),
|
254 |
+
)
|
255 |
+
),
|
256 |
+
]
|
257 |
+
)
|
258 |
+
self.convs1.apply(init_weights)
|
259 |
+
|
260 |
+
self.convs2 = nn.ModuleList(
|
261 |
+
[
|
262 |
+
weight_norm(
|
263 |
+
Conv1d(
|
264 |
+
channels,
|
265 |
+
channels,
|
266 |
+
kernel_size,
|
267 |
+
1,
|
268 |
+
dilation=1,
|
269 |
+
padding=get_padding(kernel_size, 1),
|
270 |
+
)
|
271 |
+
),
|
272 |
+
weight_norm(
|
273 |
+
Conv1d(
|
274 |
+
channels,
|
275 |
+
channels,
|
276 |
+
kernel_size,
|
277 |
+
1,
|
278 |
+
dilation=1,
|
279 |
+
padding=get_padding(kernel_size, 1),
|
280 |
+
)
|
281 |
+
),
|
282 |
+
weight_norm(
|
283 |
+
Conv1d(
|
284 |
+
channels,
|
285 |
+
channels,
|
286 |
+
kernel_size,
|
287 |
+
1,
|
288 |
+
dilation=1,
|
289 |
+
padding=get_padding(kernel_size, 1),
|
290 |
+
)
|
291 |
+
),
|
292 |
+
]
|
293 |
+
)
|
294 |
+
self.convs2.apply(init_weights)
|
295 |
+
|
296 |
+
def forward(self, x, x_mask=None):
|
297 |
+
for c1, c2 in zip(self.convs1, self.convs2):
|
298 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
299 |
+
if x_mask is not None:
|
300 |
+
xt = xt * x_mask
|
301 |
+
xt = c1(xt)
|
302 |
+
xt = F.leaky_relu(xt, LRELU_SLOPE)
|
303 |
+
if x_mask is not None:
|
304 |
+
xt = xt * x_mask
|
305 |
+
xt = c2(xt)
|
306 |
+
x = xt + x
|
307 |
+
if x_mask is not None:
|
308 |
+
x = x * x_mask
|
309 |
+
return x
|
310 |
+
|
311 |
+
def remove_weight_norm(self):
|
312 |
+
for l in self.convs1:
|
313 |
+
remove_weight_norm(l)
|
314 |
+
for l in self.convs2:
|
315 |
+
remove_weight_norm(l)
|
316 |
+
|
317 |
+
|
318 |
+
class ResBlock2(torch.nn.Module):
|
319 |
+
def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
|
320 |
+
super(ResBlock2, self).__init__()
|
321 |
+
self.convs = nn.ModuleList(
|
322 |
+
[
|
323 |
+
weight_norm(
|
324 |
+
Conv1d(
|
325 |
+
channels,
|
326 |
+
channels,
|
327 |
+
kernel_size,
|
328 |
+
1,
|
329 |
+
dilation=dilation[0],
|
330 |
+
padding=get_padding(kernel_size, dilation[0]),
|
331 |
+
)
|
332 |
+
),
|
333 |
+
weight_norm(
|
334 |
+
Conv1d(
|
335 |
+
channels,
|
336 |
+
channels,
|
337 |
+
kernel_size,
|
338 |
+
1,
|
339 |
+
dilation=dilation[1],
|
340 |
+
padding=get_padding(kernel_size, dilation[1]),
|
341 |
+
)
|
342 |
+
),
|
343 |
+
]
|
344 |
+
)
|
345 |
+
self.convs.apply(init_weights)
|
346 |
+
|
347 |
+
def forward(self, x, x_mask=None):
|
348 |
+
for c in self.convs:
|
349 |
+
xt = F.leaky_relu(x, LRELU_SLOPE)
|
350 |
+
if x_mask is not None:
|
351 |
+
xt = xt * x_mask
|
352 |
+
xt = c(xt)
|
353 |
+
x = xt + x
|
354 |
+
if x_mask is not None:
|
355 |
+
x = x * x_mask
|
356 |
+
return x
|
357 |
+
|
358 |
+
def remove_weight_norm(self):
|
359 |
+
for l in self.convs:
|
360 |
+
remove_weight_norm(l)
|
361 |
+
|
362 |
+
|
363 |
+
class Log(nn.Module):
|
364 |
+
def forward(self, x, x_mask, reverse=False, **kwargs):
|
365 |
+
if not reverse:
|
366 |
+
y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
|
367 |
+
logdet = torch.sum(-y, [1, 2])
|
368 |
+
return y, logdet
|
369 |
+
else:
|
370 |
+
x = torch.exp(x) * x_mask
|
371 |
+
return x
|
372 |
+
|
373 |
+
|
374 |
+
class Flip(nn.Module):
|
375 |
+
def forward(self, x, *args, reverse=False, **kwargs):
|
376 |
+
x = torch.flip(x, [1])
|
377 |
+
if not reverse:
|
378 |
+
logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
|
379 |
+
return x, logdet
|
380 |
+
else:
|
381 |
+
return x
|
382 |
+
|
383 |
+
|
384 |
+
class ElementwiseAffine(nn.Module):
|
385 |
+
def __init__(self, channels):
|
386 |
+
super().__init__()
|
387 |
+
self.channels = channels
|
388 |
+
self.m = nn.Parameter(torch.zeros(channels, 1))
|
389 |
+
self.logs = nn.Parameter(torch.zeros(channels, 1))
|
390 |
+
|
391 |
+
def forward(self, x, x_mask, reverse=False, **kwargs):
|
392 |
+
if not reverse:
|
393 |
+
y = self.m + torch.exp(self.logs) * x
|
394 |
+
y = y * x_mask
|
395 |
+
logdet = torch.sum(self.logs * x_mask, [1, 2])
|
396 |
+
return y, logdet
|
397 |
+
else:
|
398 |
+
x = (x - self.m) * torch.exp(-self.logs) * x_mask
|
399 |
+
return x
|
400 |
+
|
401 |
+
|
402 |
+
class ResidualCouplingLayer(nn.Module):
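# Affine coupling layer: the first half of the channels conditions a WN network that
# predicts the shift (and optionally scale) for the second half; invertible via reverse=True.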
|
403 |
+
def __init__(
|
404 |
+
self,
|
405 |
+
channels,
|
406 |
+
hidden_channels,
|
407 |
+
kernel_size,
|
408 |
+
dilation_rate,
|
409 |
+
n_layers,
|
410 |
+
p_dropout=0,
|
411 |
+
gin_channels=0,
|
412 |
+
mean_only=False,
|
413 |
+
):
|
414 |
+
assert channels % 2 == 0, "channels should be divisible by 2"
|
415 |
+
super().__init__()
|
416 |
+
self.channels = channels
|
417 |
+
self.hidden_channels = hidden_channels
|
418 |
+
self.kernel_size = kernel_size
|
419 |
+
self.dilation_rate = dilation_rate
|
420 |
+
self.n_layers = n_layers
|
421 |
+
self.half_channels = channels // 2
|
422 |
+
self.mean_only = mean_only
|
423 |
+
|
424 |
+
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
|
425 |
+
self.enc = WN(
|
426 |
+
hidden_channels,
|
427 |
+
kernel_size,
|
428 |
+
dilation_rate,
|
429 |
+
n_layers,
|
430 |
+
p_dropout=p_dropout,
|
431 |
+
gin_channels=gin_channels,
|
432 |
+
)
|
433 |
+
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
|
434 |
+
self.post.weight.data.zero_()
|
435 |
+
self.post.bias.data.zero_()
|
436 |
+
|
437 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
438 |
+
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
|
439 |
+
h = self.pre(x0) * x_mask
|
440 |
+
h = self.enc(h, x_mask, g=g)
|
441 |
+
stats = self.post(h) * x_mask
|
442 |
+
if not self.mean_only:
|
443 |
+
m, logs = torch.split(stats, [self.half_channels] * 2, 1)
|
444 |
+
else:
|
445 |
+
m = stats
|
446 |
+
logs = torch.zeros_like(m)
|
447 |
+
|
448 |
+
if not reverse:
|
449 |
+
x1 = m + x1 * torch.exp(logs) * x_mask
|
450 |
+
x = torch.cat([x0, x1], 1)
|
451 |
+
logdet = torch.sum(logs, [1, 2])
|
452 |
+
return x, logdet
|
453 |
+
else:
|
454 |
+
x1 = (x1 - m) * torch.exp(-logs) * x_mask
|
455 |
+
x = torch.cat([x0, x1], 1)
|
456 |
+
return x
|
457 |
+
|
458 |
+
|
459 |
+
class ConvFlow(nn.Module):
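# Spline coupling layer: a DDSConv network predicts the parameters of a piecewise
# rational-quadratic transform applied to the second half of the channels.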
|
460 |
+
def __init__(
|
461 |
+
self,
|
462 |
+
in_channels,
|
463 |
+
filter_channels,
|
464 |
+
kernel_size,
|
465 |
+
n_layers,
|
466 |
+
num_bins=10,
|
467 |
+
tail_bound=5.0,
|
468 |
+
):
|
469 |
+
super().__init__()
|
470 |
+
self.in_channels = in_channels
|
471 |
+
self.filter_channels = filter_channels
|
472 |
+
self.kernel_size = kernel_size
|
473 |
+
self.n_layers = n_layers
|
474 |
+
self.num_bins = num_bins
|
475 |
+
self.tail_bound = tail_bound
|
476 |
+
self.half_channels = in_channels // 2
|
477 |
+
|
478 |
+
self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
|
479 |
+
self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
|
480 |
+
self.proj = nn.Conv1d(
|
481 |
+
filter_channels, self.half_channels * (num_bins * 3 - 1), 1
|
482 |
+
)
|
483 |
+
self.proj.weight.data.zero_()
|
484 |
+
self.proj.bias.data.zero_()
|
485 |
+
|
486 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
487 |
+
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
|
488 |
+
h = self.pre(x0)
|
489 |
+
h = self.convs(h, x_mask, g=g)
|
490 |
+
h = self.proj(h) * x_mask
|
491 |
+
|
492 |
+
b, c, t = x0.shape
|
493 |
+
h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
|
494 |
+
|
495 |
+
unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
|
496 |
+
unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
|
497 |
+
self.filter_channels
|
498 |
+
)
|
499 |
+
unnormalized_derivatives = h[..., 2 * self.num_bins :]
|
500 |
+
|
501 |
+
x1, logabsdet = piecewise_rational_quadratic_transform(
|
502 |
+
x1,
|
503 |
+
unnormalized_widths,
|
504 |
+
unnormalized_heights,
|
505 |
+
unnormalized_derivatives,
|
506 |
+
inverse=reverse,
|
507 |
+
tails="linear",
|
508 |
+
tail_bound=self.tail_bound,
|
509 |
+
)
|
510 |
+
|
511 |
+
x = torch.cat([x0, x1], 1) * x_mask
|
512 |
+
logdet = torch.sum(logabsdet * x_mask, [1, 2])
|
513 |
+
if not reverse:
|
514 |
+
return x, logdet
|
515 |
+
else:
|
516 |
+
return x
|
517 |
+
|
518 |
+
|
519 |
+
class TransformerCouplingLayer(nn.Module):
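# Coupling layer variant whose conditioning network is an attention Encoder instead of WN;
# otherwise mirrors ResidualCouplingLayer.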
|
520 |
+
def __init__(
|
521 |
+
self,
|
522 |
+
channels,
|
523 |
+
hidden_channels,
|
524 |
+
kernel_size,
|
525 |
+
n_layers,
|
526 |
+
n_heads,
|
527 |
+
p_dropout=0,
|
528 |
+
filter_channels=0,
|
529 |
+
mean_only=False,
|
530 |
+
wn_sharing_parameter=None,
|
531 |
+
gin_channels=0,
|
532 |
+
):
|
533 |
+
assert n_layers == 3, n_layers
|
534 |
+
assert channels % 2 == 0, "channels should be divisible by 2"
|
535 |
+
super().__init__()
|
536 |
+
self.channels = channels
|
537 |
+
self.hidden_channels = hidden_channels
|
538 |
+
self.kernel_size = kernel_size
|
539 |
+
self.n_layers = n_layers
|
540 |
+
self.half_channels = channels // 2
|
541 |
+
self.mean_only = mean_only
|
542 |
+
|
543 |
+
self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
|
544 |
+
self.enc = (
|
545 |
+
Encoder(
|
546 |
+
hidden_channels,
|
547 |
+
filter_channels,
|
548 |
+
n_heads,
|
549 |
+
n_layers,
|
550 |
+
kernel_size,
|
551 |
+
p_dropout,
|
552 |
+
isflow=True,
|
553 |
+
gin_channels=gin_channels,
|
554 |
+
)
|
555 |
+
if wn_sharing_parameter is None
|
556 |
+
else wn_sharing_parameter
|
557 |
+
)
|
558 |
+
self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
|
559 |
+
self.post.weight.data.zero_()
|
560 |
+
self.post.bias.data.zero_()
|
561 |
+
|
562 |
+
def forward(self, x, x_mask, g=None, reverse=False):
|
563 |
+
x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
|
564 |
+
h = self.pre(x0) * x_mask
|
565 |
+
h = self.enc(h, x_mask, g=g)
|
566 |
+
stats = self.post(h) * x_mask
|
567 |
+
if not self.mean_only:
|
568 |
+
m, logs = torch.split(stats, [self.half_channels] * 2, 1)
|
569 |
+
else:
|
570 |
+
m = stats
|
571 |
+
logs = torch.zeros_like(m)
|
572 |
+
|
573 |
+
if not reverse:
|
574 |
+
x1 = m + x1 * torch.exp(logs) * x_mask
|
575 |
+
x = torch.cat([x0, x1], 1)
|
576 |
+
logdet = torch.sum(logs, [1, 2])
|
577 |
+
return x, logdet
|
578 |
+
else:
|
579 |
+
x1 = (x1 - m) * torch.exp(-logs) * x_mask
|
580 |
+
x = torch.cat([x0, x1], 1)
|
581 |
+
return x
|
582 |
+
|
583 |
+
x1, logabsdet = piecewise_rational_quadratic_transform(
|
584 |
+
x1,
|
585 |
+
unnormalized_widths,
|
586 |
+
unnormalized_heights,
|
587 |
+
unnormalized_derivatives,
|
588 |
+
inverse=reverse,
|
589 |
+
tails="linear",
|
590 |
+
tail_bound=self.tail_bound,
|
591 |
+
)
|
592 |
+
|
593 |
+
x = torch.cat([x0, x1], 1) * x_mask
|
594 |
+
logdet = torch.sum(logabsdet * x_mask, [1, 2])
|
595 |
+
if not reverse:
|
596 |
+
return x, logdet
|
597 |
+
else:
|
598 |
+
return x
|
dreamvoice/train_utils/src/openvoice/openvoice_app.py
ADDED
@@ -0,0 +1,275 @@
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import argparse
|
4 |
+
import gradio as gr
|
5 |
+
from zipfile import ZipFile
|
6 |
+
import langid
|
7 |
+
from openvoice import se_extractor
|
8 |
+
from openvoice.api import BaseSpeakerTTS, ToneColorConverter
|
9 |
+
|
10 |
+
parser = argparse.ArgumentParser()
|
11 |
+
parser.add_argument("--share", action='store_true', default=False, help="make link public")
|
12 |
+
args = parser.parse_args()
|
13 |
+
|
14 |
+
en_ckpt_base = 'checkpoints/base_speakers/EN'
|
15 |
+
zh_ckpt_base = 'checkpoints/base_speakers/ZH'
|
16 |
+
ckpt_converter = 'checkpoints/converter'
|
17 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
18 |
+
output_dir = 'outputs'
|
19 |
+
os.makedirs(output_dir, exist_ok=True)
|
20 |
+
|
21 |
+
# load models
|
22 |
+
en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
|
23 |
+
en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
|
24 |
+
zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
|
25 |
+
zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
|
26 |
+
tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
|
27 |
+
tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
|
28 |
+
|
29 |
+
# load speaker embeddings
|
30 |
+
en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
|
31 |
+
en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
|
32 |
+
zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
|
33 |
+
|
34 |
+
# This online demo mainly supports English and Chinese
|
35 |
+
supported_languages = ['zh', 'en']
|
36 |
+
|
37 |
+
def predict(prompt, style, audio_file_pth, agree):
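# Gradio callback: validate the inputs, synthesise base-speaker audio in the selected style,
# extract the reference speaker's tone-color embedding, run the tone color converter, and
# return (info text, converted audio, reference audio).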
|
38 |
+
# initialize an empty info message
|
39 |
+
text_hint = ''
|
40 |
+
# agree with the terms
|
41 |
+
if not agree:
|
42 |
+
text_hint += '[ERROR] Please accept the Terms & Conditions!\n'
|
43 |
+
gr.Warning("Please accept the Terms & Condition!")
|
44 |
+
return (
|
45 |
+
text_hint,
|
46 |
+
None,
|
47 |
+
None,
|
48 |
+
)
|
49 |
+
|
50 |
+
# first detect the input language
|
51 |
+
language_predicted = langid.classify(prompt)[0].strip()
|
52 |
+
print(f"Detected language:{language_predicted}")
|
53 |
+
|
54 |
+
if language_predicted not in supported_languages:
|
55 |
+
text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
|
56 |
+
gr.Warning(
|
57 |
+
f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
|
58 |
+
)
|
59 |
+
|
60 |
+
return (
|
61 |
+
text_hint,
|
62 |
+
None,
|
63 |
+
None,
|
64 |
+
)
|
65 |
+
|
66 |
+
if language_predicted == "zh":
|
67 |
+
tts_model = zh_base_speaker_tts
|
68 |
+
source_se = zh_source_se
|
69 |
+
language = 'Chinese'
|
70 |
+
if style not in ['default']:
|
71 |
+
text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
|
72 |
+
gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
|
73 |
+
return (
|
74 |
+
text_hint,
|
75 |
+
None,
|
76 |
+
None,
|
77 |
+
)
|
78 |
+
|
79 |
+
else:
|
80 |
+
tts_model = en_base_speaker_tts
|
81 |
+
if style == 'default':
|
82 |
+
source_se = en_source_default_se
|
83 |
+
else:
|
84 |
+
source_se = en_source_style_se
|
85 |
+
language = 'English'
|
86 |
+
if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
|
87 |
+
text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
|
88 |
+
gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
|
89 |
+
return (
|
90 |
+
text_hint,
|
91 |
+
None,
|
92 |
+
None,
|
93 |
+
)
|
94 |
+
|
95 |
+
speaker_wav = audio_file_pth
|
96 |
+
|
97 |
+
if len(prompt) < 2:
|
98 |
+
text_hint += f"[ERROR] Please give a longer prompt text \n"
|
99 |
+
gr.Warning("Please give a longer prompt text")
|
100 |
+
return (
|
101 |
+
text_hint,
|
102 |
+
None,
|
103 |
+
None,
|
104 |
+
)
|
105 |
+
if len(prompt) > 200:
|
106 |
+
text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
|
107 |
+
gr.Warning(
|
108 |
+
"Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
|
109 |
+
)
|
110 |
+
return (
|
111 |
+
text_hint,
|
112 |
+
None,
|
113 |
+
None,
|
114 |
+
)
|
115 |
+
|
116 |
+
# note diffusion_conditioning not used on hifigan (default mode), it will be empty but need to pass it to model.inference
|
117 |
+
try:
|
118 |
+
target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
|
119 |
+
except Exception as e:
|
120 |
+
text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
|
121 |
+
gr.Warning(
|
122 |
+
"[ERROR] Get target tone color error {str(e)} \n"
|
123 |
+
)
|
124 |
+
return (
|
125 |
+
text_hint,
|
126 |
+
None,
|
127 |
+
None,
|
128 |
+
)
|
129 |
+
|
130 |
+
src_path = f'{output_dir}/tmp.wav'
|
131 |
+
tts_model.tts(prompt, src_path, speaker=style, language=language)
|
132 |
+
|
133 |
+
save_path = f'{output_dir}/output.wav'
|
134 |
+
# Run the tone color converter
|
135 |
+
encode_message = "@MyShell"
|
136 |
+
tone_color_converter.convert(
|
137 |
+
audio_src_path=src_path,
|
138 |
+
src_se=source_se,
|
139 |
+
tgt_se=target_se,
|
140 |
+
output_path=save_path,
|
141 |
+
message=encode_message)
|
142 |
+
|
143 |
+
text_hint += f'''Get response successfully \n'''
|
144 |
+
|
145 |
+
return (
|
146 |
+
text_hint,
|
147 |
+
save_path,
|
148 |
+
speaker_wav,
|
149 |
+
)
|
150 |
+
|
151 |
+
|
152 |
+
|
153 |
+
title = "MyShell OpenVoice"
|
154 |
+
|
155 |
+
description = """
|
156 |
+
We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
|
157 |
+
"""
|
158 |
+
|
159 |
+
markdown_table = """
|
160 |
+
<div align="center" style="margin-bottom: 10px;">
|
161 |
+
|
162 |
+
| | | |
|
163 |
+
| :-----------: | :-----------: | :-----------: |
|
164 |
+
| **OpenSource Repo** | **Project Page** | **Join the Community** |
|
165 |
+
| <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
|
166 |
+
|
167 |
+
</div>
|
168 |
+
"""
|
169 |
+
|
170 |
+
markdown_table_v2 = """
|
171 |
+
<div align="center" style="margin-bottom: 2px;">
|
172 |
+
|
173 |
+
| | | | |
|
174 |
+
| :-----------: | :-----------: | :-----------: | :-----------: |
|
175 |
+
| **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
|
176 |
+
|
177 |
+
| | |
|
178 |
+
| :-----------: | :-----------: |
|
179 |
+
**Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
|
180 |
+
|
181 |
+
</div>
|
182 |
+
"""
|
183 |
+
content = """
|
184 |
+
<div>
|
185 |
+
<strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
|
186 |
+
This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
|
187 |
+
</div>
|
188 |
+
"""
|
189 |
+
wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
|
190 |
+
|
191 |
+
|
192 |
+
examples = [
|
193 |
+
[
|
194 |
+
"今天天气真好,我们一起出去吃饭吧。",
|
195 |
+
'default',
|
196 |
+
"resources/demo_speaker1.mp3",
|
197 |
+
True,
|
198 |
+
],[
|
199 |
+
"This audio is generated by open voice with a half-performance model.",
|
200 |
+
'whispering',
|
201 |
+
"resources/demo_speaker2.mp3",
|
202 |
+
True,
|
203 |
+
],
|
204 |
+
[
|
205 |
+
"He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
206 |
+
'sad',
|
207 |
+
"resources/demo_speaker0.mp3",
|
208 |
+
True,
|
209 |
+
],
|
210 |
+
]
|
211 |
+
|
212 |
+
with gr.Blocks(analytics_enabled=False) as demo:
|
213 |
+
|
214 |
+
with gr.Row():
|
215 |
+
with gr.Column():
|
216 |
+
with gr.Row():
|
217 |
+
gr.Markdown(
|
218 |
+
"""
|
219 |
+
## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
|
220 |
+
"""
|
221 |
+
)
|
222 |
+
with gr.Row():
|
223 |
+
gr.Markdown(markdown_table_v2)
|
224 |
+
with gr.Row():
|
225 |
+
gr.Markdown(description)
|
226 |
+
with gr.Column():
|
227 |
+
gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
|
228 |
+
|
229 |
+
with gr.Row():
|
230 |
+
gr.HTML(wrapped_markdown_content)
|
231 |
+
|
232 |
+
with gr.Row():
|
233 |
+
with gr.Column():
|
234 |
+
input_text_gr = gr.Textbox(
|
235 |
+
label="Text Prompt",
|
236 |
+
info="One or two sentences at a time is better. Up to 200 text characters.",
|
237 |
+
value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
|
238 |
+
)
|
239 |
+
style_gr = gr.Dropdown(
|
240 |
+
label="Style",
|
241 |
+
info="Select a style of output audio for the synthesised speech. (Chinese only support 'default' now)",
|
242 |
+
choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
|
243 |
+
max_choices=1,
|
244 |
+
value="default",
|
245 |
+
)
|
246 |
+
ref_gr = gr.Audio(
|
247 |
+
label="Reference Audio",
|
248 |
+
info="Click on the ✎ button to upload your own target speaker audio",
|
249 |
+
type="filepath",
|
250 |
+
value="resources/demo_speaker2.mp3",
|
251 |
+
)
|
252 |
+
tos_gr = gr.Checkbox(
|
253 |
+
label="Agree",
|
254 |
+
value=False,
|
255 |
+
info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
|
256 |
+
)
|
257 |
+
|
258 |
+
tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
|
259 |
+
|
260 |
+
|
261 |
+
with gr.Column():
|
262 |
+
out_text_gr = gr.Text(label="Info")
|
263 |
+
audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
|
264 |
+
ref_audio_gr = gr.Audio(label="Reference Audio Used")
|
265 |
+
|
266 |
+
gr.Examples(examples,
|
267 |
+
label="Examples",
|
268 |
+
inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
|
269 |
+
outputs=[out_text_gr, audio_gr, ref_audio_gr],
|
270 |
+
fn=predict,
|
271 |
+
cache_examples=False,)
|
272 |
+
tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
|
273 |
+
|
274 |
+
demo.queue()
|
275 |
+
demo.launch(debug=True, show_api=True, share=args.share)
|
dreamvoice/train_utils/src/openvoice/se_extractor.py
ADDED
@@ -0,0 +1,153 @@
1 |
+
import os
|
2 |
+
import glob
|
3 |
+
import torch
|
4 |
+
import hashlib
|
5 |
+
import librosa
|
6 |
+
import base64
|
7 |
+
from glob import glob
|
8 |
+
import numpy as np
|
9 |
+
from pydub import AudioSegment
|
10 |
+
from faster_whisper import WhisperModel
|
14 |
+
from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
|
15 |
+
|
16 |
+
model_size = "medium"
|
17 |
+
# Run on GPU with FP16
|
18 |
+
model = None
|
19 |
+
def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
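# Transcribe the input with faster-whisper and cut the audio at segment boundaries,
# keeping segments of roughly 1.5-20 s that carry a non-trivial amount of text.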
|
20 |
+
global model
|
21 |
+
if model is None:
|
22 |
+
model = WhisperModel(model_size, device="cuda", compute_type="float16")
|
23 |
+
audio = AudioSegment.from_file(audio_path)
|
24 |
+
max_len = len(audio)
|
25 |
+
|
26 |
+
target_folder = os.path.join(target_dir, audio_name)
|
27 |
+
|
28 |
+
segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
|
29 |
+
segments = list(segments)
|
30 |
+
|
31 |
+
# create directory
|
32 |
+
os.makedirs(target_folder, exist_ok=True)
|
33 |
+
wavs_folder = os.path.join(target_folder, 'wavs')
|
34 |
+
os.makedirs(wavs_folder, exist_ok=True)
|
35 |
+
|
36 |
+
# segments
|
37 |
+
s_ind = 0
|
38 |
+
start_time = None
|
39 |
+
|
40 |
+
for k, w in enumerate(segments):
|
41 |
+
# process with the time
|
42 |
+
if k == 0:
|
43 |
+
start_time = max(0, w.start)
|
44 |
+
|
45 |
+
end_time = w.end
|
46 |
+
|
47 |
+
# calculate confidence
|
48 |
+
if len(w.words) > 0:
|
49 |
+
confidence = sum([s.probability for s in w.words]) / len(w.words)
|
50 |
+
else:
|
51 |
+
confidence = 0.
|
52 |
+
# clean text
|
53 |
+
text = w.text.replace('...', '')
|
54 |
+
|
55 |
+
# leave 0.08 s of extra audio at the end of each segment
|
56 |
+
audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
|
57 |
+
|
58 |
+
# segment file name
|
59 |
+
fname = f"{audio_name}_seg{s_ind}.wav"
|
60 |
+
|
61 |
+
# keep only segments between 1.5 s and 20 s long whose text has 2 to 199 characters
|
62 |
+
save = audio_seg.duration_seconds > 1.5 and \
|
63 |
+
audio_seg.duration_seconds < 20. and \
|
64 |
+
len(text) >= 2 and len(text) < 200
|
65 |
+
|
66 |
+
if save:
|
67 |
+
output_file = os.path.join(wavs_folder, fname)
|
68 |
+
audio_seg.export(output_file, format='wav')
|
69 |
+
|
70 |
+
if k < len(segments) - 1:
|
71 |
+
start_time = max(0, segments[k+1].start - 0.08)
|
72 |
+
|
73 |
+
s_ind = s_ind + 1
|
74 |
+
return wavs_folder
|
75 |
+
|
76 |
+
|
77 |
+
def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
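# Keep only the voiced regions found by silero VAD, concatenate them, and slice the result
# into roughly split_seconds-long chunks under <target_dir>/<audio_name>/wavs.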
|
78 |
+
SAMPLE_RATE = 16000
|
79 |
+
audio_vad = get_audio_tensor(audio_path)
|
80 |
+
segments = get_vad_segments(
|
81 |
+
audio_vad,
|
82 |
+
output_sample=True,
|
83 |
+
min_speech_duration=0.1,
|
84 |
+
min_silence_duration=1,
|
85 |
+
method="silero",
|
86 |
+
)
|
87 |
+
segments = [(seg["start"], seg["end"]) for seg in segments]
|
88 |
+
segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
|
89 |
+
print(segments)
|
90 |
+
audio_active = AudioSegment.silent(duration=0)
|
91 |
+
audio = AudioSegment.from_file(audio_path)
|
92 |
+
|
93 |
+
for start_time, end_time in segments:
|
94 |
+
audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
|
95 |
+
|
96 |
+
audio_dur = audio_active.duration_seconds
|
97 |
+
print(f'after vad: dur = {audio_dur}')
|
98 |
+
target_folder = os.path.join(target_dir, audio_name)
|
99 |
+
wavs_folder = os.path.join(target_folder, 'wavs')
|
100 |
+
os.makedirs(wavs_folder, exist_ok=True)
|
101 |
+
start_time = 0.
|
102 |
+
count = 0
|
103 |
+
num_splits = int(np.round(audio_dur / split_seconds))
|
104 |
+
assert num_splits > 0, 'input audio is too short'
|
105 |
+
interval = audio_dur / num_splits
|
106 |
+
|
107 |
+
for i in range(num_splits):
|
108 |
+
end_time = min(start_time + interval, audio_dur)
|
109 |
+
if i == num_splits - 1:
|
110 |
+
end_time = audio_dur
|
111 |
+
output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
|
112 |
+
audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
|
113 |
+
audio_seg.export(output_file, format='wav')
|
114 |
+
start_time = end_time
|
115 |
+
count += 1
|
116 |
+
return wavs_folder
|
117 |
+
|
118 |
+
def hash_numpy_array(audio_path):
|
119 |
+
array, _ = librosa.load(audio_path, sr=None, mono=True)
|
120 |
+
# Convert the array to bytes
|
121 |
+
array_bytes = array.tobytes()
|
122 |
+
# Calculate the hash of the array bytes
|
123 |
+
hash_object = hashlib.sha256(array_bytes)
|
124 |
+
hash_value = hash_object.digest()
|
125 |
+
# Convert the hash value to base64
|
126 |
+
base64_value = base64.b64encode(hash_value)
|
127 |
+
return base64_value.decode('utf-8')[:16].replace('/', '_^')
|
128 |
+
|
129 |
+
def get_se(audio_path, vc_model, target_dir='processed', vad=True):
|
130 |
+
device = vc_model.device
|
131 |
+
version = vc_model.version
|
132 |
+
print("OpenVoice version:", version)
|
133 |
+
|
134 |
+
audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
|
135 |
+
se_path = os.path.join(target_dir, audio_name, 'se.pth')
|
136 |
+
|
137 |
+
# if os.path.isfile(se_path):
|
138 |
+
# se = torch.load(se_path).to(device)
|
139 |
+
# return se, audio_name
|
140 |
+
# if os.path.isdir(audio_path):
|
141 |
+
# wavs_folder = audio_path
|
142 |
+
|
143 |
+
if vad:
|
144 |
+
wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
|
145 |
+
else:
|
146 |
+
wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
|
147 |
+
|
148 |
+
audio_segs = glob(f'{wavs_folder}/*.wav')
|
149 |
+
if len(audio_segs) == 0:
|
150 |
+
raise NotImplementedError('No audio segments found!')
|
151 |
+
|
152 |
+
return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
|
153 |
+
|
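For orientation, a minimal usage sketch of the helpers above (not part of the diff). The ToneColorConverter interface and the checkpoint paths below are assumptions based on the upstream OpenVoice api.py shipped with this plugin; substitute your own paths.

# Illustrative only; ToneColorConverter and the checkpoint paths are assumptions.
from openvoice.api import ToneColorConverter
from openvoice import se_extractor

converter = ToneColorConverter('checkpoints/converter/config.json', device='cuda:0')
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

# vad=True keeps only speech (silero VAD) and splits it into ~10 s chunks;
# the tone-color embedding is then extracted from all exported chunks.
target_se, audio_name = se_extractor.get_se('reference.wav', converter,
                                            target_dir='processed', vad=True)
print(target_se.shape, audio_name)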
dreamvoice/train_utils/src/openvoice/text/__init__.py
ADDED
@@ -0,0 +1,79 @@
""" from https://github.com/keithito/tacotron """
from openvoice.text import cleaners
from openvoice.text.symbols import symbols


# Mappings from symbol to numeric ID and vice versa:
_symbol_to_id = {s: i for i, s in enumerate(symbols)}
_id_to_symbol = {i: s for i, s in enumerate(symbols)}


def text_to_sequence(text, symbols, cleaner_names):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
      cleaner_names: names of the cleaner functions to run the text through
    Returns:
      List of integers corresponding to the symbols in the text
    '''
    sequence = []
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    clean_text = _clean_text(text, cleaner_names)
    print(clean_text)
    print(f" length:{len(clean_text)}")
    for symbol in clean_text:
        if symbol not in symbol_to_id.keys():
            continue
        symbol_id = symbol_to_id[symbol]
        sequence += [symbol_id]
    print(f" length:{len(sequence)}")
    return sequence


def cleaned_text_to_sequence(cleaned_text, symbols):
    '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
    '''
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
    return sequence



from openvoice.text.symbols import language_tone_start_map
def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
    """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
    Args:
      text: string to convert to a sequence
    Returns:
      List of integers corresponding to the symbols in the text
    """
    symbol_to_id = {s: i for i, s in enumerate(symbols)}
    language_id_map = {s: i for i, s in enumerate(languages)}
    phones = [symbol_to_id[symbol] for symbol in cleaned_text]
    tone_start = language_tone_start_map[language]
    tones = [i + tone_start for i in tones]
    lang_id = language_id_map[language]
    lang_ids = [lang_id for i in phones]
    return phones, tones, lang_ids


def sequence_to_text(sequence):
    '''Converts a sequence of IDs back to a string'''
    result = ''
    for symbol_id in sequence:
        s = _id_to_symbol[symbol_id]
        result += s
    return result


def _clean_text(text, cleaner_names):
    for name in cleaner_names:
        cleaner = getattr(cleaners, name)
        if not cleaner:
            raise Exception('Unknown cleaner: %s' % name)
        text = cleaner(text)
    return text
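A short usage sketch (illustrative, not part of the diff). The cleaner name and the [EN]...[EN] tag format come from cleaners.py below; eng_to_ipa, unidecode and inflect must be installed for the English cleaner to run.

from openvoice.text import text_to_sequence, sequence_to_text
from openvoice.text.symbols import symbols

ids = text_to_sequence('[EN]Hello world.[EN]', symbols, ['cjke_cleaners2'])
# ids is a list of symbol indices; characters missing from `symbols` are skipped.
print(sequence_to_text(ids))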
dreamvoice/train_utils/src/openvoice/text/cleaners.py
ADDED
@@ -0,0 +1,16 @@
import re
from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2

def cjke_cleaners2(text):
    text = re.sub(r'\[ZH\](.*?)\[ZH\]',
                  lambda x: chinese_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[JA\](.*?)\[JA\]',
                  lambda x: japanese_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\[KO\](.*?)\[KO\]',
                  lambda x: korean_to_ipa(x.group(1))+' ', text)
    text = re.sub(r'\[EN\](.*?)\[EN\]',
                  lambda x: english_to_ipa2(x.group(1))+' ', text)
    text = re.sub(r'\s+$', '', text)
    text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
    return text
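Note that only the [ZH] and [EN] branches are self-contained here: japanese_to_ipa2 and korean_to_ipa are referenced but not imported in this file, so [JA]/[KO] tagged segments would raise a NameError. A minimal call (illustrative, not part of the diff):

from openvoice.text.cleaners import cjke_cleaners2

# Tags wrap each language span; the trailing regexes strip whitespace and
# make sure the cleaned string ends in punctuation.
print(cjke_cleaners2('[EN]OpenVoice plugin test[EN]'))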
dreamvoice/train_utils/src/openvoice/text/english.py
ADDED
@@ -0,0 +1,188 @@
""" from https://github.com/keithito/tacotron """

'''
Cleaners are transformations that run over the input text at both training and eval time.

Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
hyperparameter. Some cleaners are English-specific. You'll typically want to use:
  1. "english_cleaners" for English text
  2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
     the Unidecode library (https://pypi.python.org/pypi/Unidecode)
  3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
     the symbols in symbols.py to match your data).
'''


# Regular expression matching whitespace:


import re
import inflect
from unidecode import unidecode
import eng_to_ipa as ipa
_inflect = inflect.engine()
_comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
_decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
_pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
_dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
_ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
_number_re = re.compile(r'[0-9]+')

# List of (regular expression, replacement) pairs for abbreviations:
_abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
    ('mrs', 'misess'),
    ('mr', 'mister'),
    ('dr', 'doctor'),
    ('st', 'saint'),
    ('co', 'company'),
    ('jr', 'junior'),
    ('maj', 'major'),
    ('gen', 'general'),
    ('drs', 'doctors'),
    ('rev', 'reverend'),
    ('lt', 'lieutenant'),
    ('hon', 'honorable'),
    ('sgt', 'sergeant'),
    ('capt', 'captain'),
    ('esq', 'esquire'),
    ('ltd', 'limited'),
    ('col', 'colonel'),
    ('ft', 'fort'),
]]


# List of (ipa, lazy ipa) pairs:
_lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('r', 'ɹ'),
    ('æ', 'e'),
    ('ɑ', 'a'),
    ('ɔ', 'o'),
    ('ð', 'z'),
    ('θ', 's'),
    ('ɛ', 'e'),
    ('ɪ', 'i'),
    ('ʊ', 'u'),
    ('ʒ', 'ʥ'),
    ('ʤ', 'ʥ'),
    ('ˈ', '↓'),
]]

# List of (ipa, lazy ipa2) pairs:
_lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('r', 'ɹ'),
    ('ð', 'z'),
    ('θ', 's'),
    ('ʒ', 'ʑ'),
    ('ʤ', 'dʑ'),
    ('ˈ', '↓'),
]]

# List of (ipa, ipa2) pairs
_ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('r', 'ɹ'),
    ('ʤ', 'dʒ'),
    ('ʧ', 'tʃ')
]]


def expand_abbreviations(text):
    for regex, replacement in _abbreviations:
        text = re.sub(regex, replacement, text)
    return text


def collapse_whitespace(text):
    return re.sub(r'\s+', ' ', text)


def _remove_commas(m):
    return m.group(1).replace(',', '')


def _expand_decimal_point(m):
    return m.group(1).replace('.', ' point ')


def _expand_dollars(m):
    match = m.group(1)
    parts = match.split('.')
    if len(parts) > 2:
        return match + ' dollars'  # Unexpected format
    dollars = int(parts[0]) if parts[0] else 0
    cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
    if dollars and cents:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
    elif dollars:
        dollar_unit = 'dollar' if dollars == 1 else 'dollars'
        return '%s %s' % (dollars, dollar_unit)
    elif cents:
        cent_unit = 'cent' if cents == 1 else 'cents'
        return '%s %s' % (cents, cent_unit)
    else:
        return 'zero dollars'


def _expand_ordinal(m):
    return _inflect.number_to_words(m.group(0))


def _expand_number(m):
    num = int(m.group(0))
    if num > 1000 and num < 3000:
        if num == 2000:
            return 'two thousand'
        elif num > 2000 and num < 2010:
            return 'two thousand ' + _inflect.number_to_words(num % 100)
        elif num % 100 == 0:
            return _inflect.number_to_words(num // 100) + ' hundred'
        else:
            return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
    else:
        return _inflect.number_to_words(num, andword='')


def normalize_numbers(text):
    text = re.sub(_comma_number_re, _remove_commas, text)
    text = re.sub(_pounds_re, r'\1 pounds', text)
    text = re.sub(_dollars_re, _expand_dollars, text)
    text = re.sub(_decimal_number_re, _expand_decimal_point, text)
    text = re.sub(_ordinal_re, _expand_ordinal, text)
    text = re.sub(_number_re, _expand_number, text)
    return text


def mark_dark_l(text):
    return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)


def english_to_ipa(text):
    text = unidecode(text).lower()
    text = expand_abbreviations(text)
    text = normalize_numbers(text)
    phonemes = ipa.convert(text)
    phonemes = collapse_whitespace(phonemes)
    return phonemes


def english_to_lazy_ipa(text):
    text = english_to_ipa(text)
    for regex, replacement in _lazy_ipa:
        text = re.sub(regex, replacement, text)
    return text


def english_to_ipa2(text):
    text = english_to_ipa(text)
    text = mark_dark_l(text)
    for regex, replacement in _ipa_to_ipa2:
        text = re.sub(regex, replacement, text)
    return text.replace('...', '…')


def english_to_lazy_ipa2(text):
    text = english_to_ipa(text)
    for regex, replacement in _lazy_ipa2:
        text = re.sub(regex, replacement, text)
    return text
dreamvoice/train_utils/src/openvoice/text/mandarin.py
ADDED
@@ -0,0 +1,326 @@
import os
import sys
import re
from pypinyin import lazy_pinyin, BOPOMOFO
import jieba
import cn2an
import logging


# List of (Latin alphabet, bopomofo) pairs:
_latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('a', 'ㄟˉ'),
    ('b', 'ㄅㄧˋ'),
    ('c', 'ㄙㄧˉ'),
    ('d', 'ㄉㄧˋ'),
    ('e', 'ㄧˋ'),
    ('f', 'ㄝˊㄈㄨˋ'),
    ('g', 'ㄐㄧˋ'),
    ('h', 'ㄝˇㄑㄩˋ'),
    ('i', 'ㄞˋ'),
    ('j', 'ㄐㄟˋ'),
    ('k', 'ㄎㄟˋ'),
    ('l', 'ㄝˊㄛˋ'),
    ('m', 'ㄝˊㄇㄨˋ'),
    ('n', 'ㄣˉ'),
    ('o', 'ㄡˉ'),
    ('p', 'ㄆㄧˉ'),
    ('q', 'ㄎㄧㄡˉ'),
    ('r', 'ㄚˋ'),
    ('s', 'ㄝˊㄙˋ'),
    ('t', 'ㄊㄧˋ'),
    ('u', 'ㄧㄡˉ'),
    ('v', 'ㄨㄧˉ'),
    ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
    ('x', 'ㄝˉㄎㄨˋㄙˋ'),
    ('y', 'ㄨㄞˋ'),
    ('z', 'ㄗㄟˋ')
]]

# List of (bopomofo, romaji) pairs:
_bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'ʧ⁼'),
    ('ㄑ', 'ʧʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ʦ`⁼'),
    ('ㄔ', 'ʦ`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ʦ⁼'),
    ('ㄘ', 'ʦʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'e'),
    ('ㄞ', 'ai'),
    ('ㄟ', 'ei'),
    ('ㄠ', 'au'),
    ('ㄡ', 'ou'),
    ('ㄧㄢ', 'yeNN'),
    ('ㄢ', 'aNN'),
    ('ㄧㄣ', 'iNN'),
    ('ㄣ', 'əNN'),
    ('ㄤ', 'aNg'),
    ('ㄧㄥ', 'iNg'),
    ('ㄨㄥ', 'uNg'),
    ('ㄩㄥ', 'yuNg'),
    ('ㄥ', 'əNg'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    (',', ','),
    ('。', '.'),
    ('!', '!'),
    ('?', '?'),
    ('—', '-')
]]

# List of (romaji, ipa) pairs:
_romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
    ('ʃy', 'ʃ'),
    ('ʧʰy', 'ʧʰ'),
    ('ʧ⁼y', 'ʧ⁼'),
    ('NN', 'n'),
    ('Ng', 'ŋ'),
    ('y', 'j'),
    ('h', 'x')
]]

# List of (bopomofo, ipa) pairs:
_bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'p⁼wo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p⁼'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't⁼'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k⁼'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'x'),
    ('ㄐ', 'tʃ⁼'),
    ('ㄑ', 'tʃʰ'),
    ('ㄒ', 'ʃ'),
    ('ㄓ', 'ts`⁼'),
    ('ㄔ', 'ts`ʰ'),
    ('ㄕ', 's`'),
    ('ㄖ', 'ɹ`'),
    ('ㄗ', 'ts⁼'),
    ('ㄘ', 'tsʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ə'),
    ('ㄝ', 'ɛ'),
    ('ㄞ', 'aɪ'),
    ('ㄟ', 'eɪ'),
    ('ㄠ', 'ɑʊ'),
    ('ㄡ', 'oʊ'),
    ('ㄧㄢ', 'jɛn'),
    ('ㄩㄢ', 'ɥæn'),
    ('ㄢ', 'an'),
    ('ㄧㄣ', 'in'),
    ('ㄩㄣ', 'ɥn'),
    ('ㄣ', 'ən'),
    ('ㄤ', 'ɑŋ'),
    ('ㄧㄥ', 'iŋ'),
    ('ㄨㄥ', 'ʊŋ'),
    ('ㄩㄥ', 'jʊŋ'),
    ('ㄥ', 'əŋ'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'ɥ'),
    ('ˉ', '→'),
    ('ˊ', '↑'),
    ('ˇ', '↓↑'),
    ('ˋ', '↓'),
    ('˙', ''),
    (',', ','),
    ('。', '.'),
    ('!', '!'),
    ('?', '?'),
    ('—', '-')
]]

# List of (bopomofo, ipa2) pairs:
_bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
    ('ㄅㄛ', 'pwo'),
    ('ㄆㄛ', 'pʰwo'),
    ('ㄇㄛ', 'mwo'),
    ('ㄈㄛ', 'fwo'),
    ('ㄅ', 'p'),
    ('ㄆ', 'pʰ'),
    ('ㄇ', 'm'),
    ('ㄈ', 'f'),
    ('ㄉ', 't'),
    ('ㄊ', 'tʰ'),
    ('ㄋ', 'n'),
    ('ㄌ', 'l'),
    ('ㄍ', 'k'),
    ('ㄎ', 'kʰ'),
    ('ㄏ', 'h'),
    ('ㄐ', 'tɕ'),
    ('ㄑ', 'tɕʰ'),
    ('ㄒ', 'ɕ'),
    ('ㄓ', 'tʂ'),
    ('ㄔ', 'tʂʰ'),
    ('ㄕ', 'ʂ'),
    ('ㄖ', 'ɻ'),
    ('ㄗ', 'ts'),
    ('ㄘ', 'tsʰ'),
    ('ㄙ', 's'),
    ('ㄚ', 'a'),
    ('ㄛ', 'o'),
    ('ㄜ', 'ɤ'),
    ('ㄝ', 'ɛ'),
    ('ㄞ', 'aɪ'),
    ('ㄟ', 'eɪ'),
    ('ㄠ', 'ɑʊ'),
    ('ㄡ', 'oʊ'),
    ('ㄧㄢ', 'jɛn'),
    ('ㄩㄢ', 'yæn'),
    ('ㄢ', 'an'),
    ('ㄧㄣ', 'in'),
    ('ㄩㄣ', 'yn'),
    ('ㄣ', 'ən'),
    ('ㄤ', 'ɑŋ'),
    ('ㄧㄥ', 'iŋ'),
    ('ㄨㄥ', 'ʊŋ'),
    ('ㄩㄥ', 'jʊŋ'),
    ('ㄥ', 'ɤŋ'),
    ('ㄦ', 'əɻ'),
    ('ㄧ', 'i'),
    ('ㄨ', 'u'),
    ('ㄩ', 'y'),
    ('ˉ', '˥'),
    ('ˊ', '˧˥'),
    ('ˇ', '˨˩˦'),
    ('ˋ', '˥˩'),
    ('˙', ''),
    (',', ','),
    ('。', '.'),
    ('!', '!'),
    ('?', '?'),
    ('—', '-')
]]


def number_to_chinese(text):
    numbers = re.findall(r'\d+(?:\.?\d+)?', text)
    for number in numbers:
        text = text.replace(number, cn2an.an2cn(number), 1)
    return text


def chinese_to_bopomofo(text):
    text = text.replace('、', ',').replace(';', ',').replace(':', ',')
    words = jieba.lcut(text, cut_all=False)
    text = ''
    for word in words:
        bopomofos = lazy_pinyin(word, BOPOMOFO)
        if not re.search('[\u4e00-\u9fff]', word):
            text += word
            continue
        for i in range(len(bopomofos)):
            bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
        if text != '':
            text += ' '
        text += ''.join(bopomofos)
    return text


def latin_to_bopomofo(text):
    for regex, replacement in _latin_to_bopomofo:
        text = re.sub(regex, replacement, text)
    return text


def bopomofo_to_romaji(text):
    for regex, replacement in _bopomofo_to_romaji:
        text = re.sub(regex, replacement, text)
    return text


def bopomofo_to_ipa(text):
    for regex, replacement in _bopomofo_to_ipa:
        text = re.sub(regex, replacement, text)
    return text


def bopomofo_to_ipa2(text):
    for regex, replacement in _bopomofo_to_ipa2:
        text = re.sub(regex, replacement, text)
    return text


def chinese_to_romaji(text):
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_romaji(text)
    text = re.sub('i([aoe])', r'y\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
    text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text


def chinese_to_lazy_ipa(text):
    text = chinese_to_romaji(text)
    for regex, replacement in _romaji_to_ipa:
        text = re.sub(regex, replacement, text)
    return text


def chinese_to_ipa(text):
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa(text)
    text = re.sub('i([aoe])', r'j\1', text)
    text = re.sub('u([aoəe])', r'w\1', text)
    text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
                  r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
    text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
    return text


def chinese_to_ipa2(text):
    text = number_to_chinese(text)
    text = chinese_to_bopomofo(text)
    text = latin_to_bopomofo(text)
    text = bopomofo_to_ipa2(text)
    text = re.sub(r'i([aoe])', r'j\1', text)
    text = re.sub(r'u([aoəe])', r'w\1', text)
    text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
    text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
    return text
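The Mandarin front end is a pure text pipeline (Arabic numerals to Chinese numerals, then jieba + pypinyin to bopomofo, then regex rewrites to IPA), so it can be exercised on its own. Illustrative sketch, not part of the diff; requires pypinyin, jieba and cn2an, and the printed IPA strings are not reproduced here.

from openvoice.text.mandarin import chinese_to_ipa, chinese_to_ipa2

sample = '你好,世界 2024'
print(chinese_to_ipa(sample))   # IPA with arrow-style tone marks
print(chinese_to_ipa2(sample))  # IPA with tone letters (˥, ˧˥, ...)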
dreamvoice/train_utils/src/openvoice/text/symbols.py
ADDED
@@ -0,0 +1,88 @@
'''
Defines the set of symbols used in text input to the model.
'''

# japanese_cleaners
# _pad = '_'
# _punctuation = ',.!?-'
# _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '


'''# japanese_cleaners2
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
'''


'''# korean_cleaners
_pad = '_'
_punctuation = ',.!?…~'
_letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
'''

'''# chinese_cleaners
_pad = '_'
_punctuation = ',。!?—…'
_letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
'''

# # zh_ja_mixture_cleaners
# _pad = '_'
# _punctuation = ',.!?-~…'
# _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '


'''# sanskrit_cleaners
_pad = '_'
_punctuation = '।'
_letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
'''

'''# cjks_cleaners
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
'''

'''# thai_cleaners
_pad = '_'
_punctuation = '.!? '
_letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
'''

# # cjke_cleaners2
_pad = '_'
_punctuation = ',.!?-~…'
_letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '


'''# shanghainese_cleaners
_pad = '_'
_punctuation = ',.!?…'
_letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
'''

'''# chinese_dialect_cleaners
_pad = '_'
_punctuation = ',.!?~…─'
_letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
'''

# Export all symbols:
symbols = [_pad] + list(_punctuation) + list(_letters)

# Special symbol ids
SPACE_ID = symbols.index(" ")

num_ja_tones = 1
num_kr_tones = 1
num_zh_tones = 6
num_en_tones = 4

language_tone_start_map = {
    "ZH": 0,
    "JP": num_zh_tones,
    "EN": num_zh_tones + num_ja_tones,
    'KR': num_zh_tones + num_ja_tones + num_en_tones,
}
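For reference, the tone-offset map above resolves to fixed integers that cleaned_text_to_sequence_vits2 in text/__init__.py adds to the per-phone tone values: ZH starts at 0, JP at 6, EN at 6 + 1 = 7 and KR at 6 + 1 + 4 = 11, giving 6 + 1 + 4 + 1 = 12 tone ids in total.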
dreamvoice/train_utils/src/openvoice/transforms.py
ADDED
@@ -0,0 +1,209 @@
import torch
from torch.nn import functional as F

import numpy as np


DEFAULT_MIN_BIN_WIDTH = 1e-3
DEFAULT_MIN_BIN_HEIGHT = 1e-3
DEFAULT_MIN_DERIVATIVE = 1e-3


def piecewise_rational_quadratic_transform(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails=None,
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    if tails is None:
        spline_fn = rational_quadratic_spline
        spline_kwargs = {}
    else:
        spline_fn = unconstrained_rational_quadratic_spline
        spline_kwargs = {"tails": tails, "tail_bound": tail_bound}

    outputs, logabsdet = spline_fn(
        inputs=inputs,
        unnormalized_widths=unnormalized_widths,
        unnormalized_heights=unnormalized_heights,
        unnormalized_derivatives=unnormalized_derivatives,
        inverse=inverse,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
        **spline_kwargs
    )
    return outputs, logabsdet


def searchsorted(bin_locations, inputs, eps=1e-6):
    bin_locations[..., -1] += eps
    return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1


def unconstrained_rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    tails="linear",
    tail_bound=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
    outside_interval_mask = ~inside_interval_mask

    outputs = torch.zeros_like(inputs)
    logabsdet = torch.zeros_like(inputs)

    if tails == "linear":
        unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
        constant = np.log(np.exp(1 - min_derivative) - 1)
        unnormalized_derivatives[..., 0] = constant
        unnormalized_derivatives[..., -1] = constant

        outputs[outside_interval_mask] = inputs[outside_interval_mask]
        logabsdet[outside_interval_mask] = 0
    else:
        raise RuntimeError("{} tails are not implemented.".format(tails))

    (
        outputs[inside_interval_mask],
        logabsdet[inside_interval_mask],
    ) = rational_quadratic_spline(
        inputs=inputs[inside_interval_mask],
        unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
        unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
        unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
        inverse=inverse,
        left=-tail_bound,
        right=tail_bound,
        bottom=-tail_bound,
        top=tail_bound,
        min_bin_width=min_bin_width,
        min_bin_height=min_bin_height,
        min_derivative=min_derivative,
    )

    return outputs, logabsdet


def rational_quadratic_spline(
    inputs,
    unnormalized_widths,
    unnormalized_heights,
    unnormalized_derivatives,
    inverse=False,
    left=0.0,
    right=1.0,
    bottom=0.0,
    top=1.0,
    min_bin_width=DEFAULT_MIN_BIN_WIDTH,
    min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
    min_derivative=DEFAULT_MIN_DERIVATIVE,
):
    if torch.min(inputs) < left or torch.max(inputs) > right:
        raise ValueError("Input to a transform is not within its domain")

    num_bins = unnormalized_widths.shape[-1]

    if min_bin_width * num_bins > 1.0:
        raise ValueError("Minimal bin width too large for the number of bins")
    if min_bin_height * num_bins > 1.0:
        raise ValueError("Minimal bin height too large for the number of bins")

    widths = F.softmax(unnormalized_widths, dim=-1)
    widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
    cumwidths = torch.cumsum(widths, dim=-1)
    cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
    cumwidths = (right - left) * cumwidths + left
    cumwidths[..., 0] = left
    cumwidths[..., -1] = right
    widths = cumwidths[..., 1:] - cumwidths[..., :-1]

    derivatives = min_derivative + F.softplus(unnormalized_derivatives)

    heights = F.softmax(unnormalized_heights, dim=-1)
    heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
    cumheights = torch.cumsum(heights, dim=-1)
    cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
    cumheights = (top - bottom) * cumheights + bottom
    cumheights[..., 0] = bottom
    cumheights[..., -1] = top
    heights = cumheights[..., 1:] - cumheights[..., :-1]

    if inverse:
        bin_idx = searchsorted(cumheights, inputs)[..., None]
    else:
        bin_idx = searchsorted(cumwidths, inputs)[..., None]

    input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
    input_bin_widths = widths.gather(-1, bin_idx)[..., 0]

    input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
    delta = heights / widths
    input_delta = delta.gather(-1, bin_idx)[..., 0]

    input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
    input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]

    input_heights = heights.gather(-1, bin_idx)[..., 0]

    if inverse:
        a = (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        ) + input_heights * (input_delta - input_derivatives)
        b = input_heights * input_derivatives - (inputs - input_cumheights) * (
            input_derivatives + input_derivatives_plus_one - 2 * input_delta
        )
        c = -input_delta * (inputs - input_cumheights)

        discriminant = b.pow(2) - 4 * a * c
        assert (discriminant >= 0).all()

        root = (2 * c) / (-b - torch.sqrt(discriminant))
        outputs = root * input_bin_widths + input_cumwidths

        theta_one_minus_theta = root * (1 - root)
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * root.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - root).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, -logabsdet
    else:
        theta = (inputs - input_cumwidths) / input_bin_widths
        theta_one_minus_theta = theta * (1 - theta)

        numerator = input_heights * (
            input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
        )
        denominator = input_delta + (
            (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
            * theta_one_minus_theta
        )
        outputs = input_cumheights + numerator / denominator

        derivative_numerator = input_delta.pow(2) * (
            input_derivatives_plus_one * theta.pow(2)
            + 2 * input_delta * theta_one_minus_theta
            + input_derivatives * (1 - theta).pow(2)
        )
        logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)

        return outputs, logabsdet
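A shape-level sketch of the spline interface above (illustrative, not part of the diff). With linear tails the derivative tensor carries num_bins - 1 values per element, because F.pad adds one boundary derivative on each side; inputs outside [-tail_bound, tail_bound] pass through unchanged.

import torch
from openvoice.transforms import piecewise_rational_quadratic_transform

B, T, num_bins = 2, 5, 8
inputs = torch.empty(B, T).uniform_(-1.0, 1.0)
widths = torch.randn(B, T, num_bins)
heights = torch.randn(B, T, num_bins)
derivs = torch.randn(B, T, num_bins - 1)   # padded to num_bins + 1 internally

y, logdet = piecewise_rational_quadratic_transform(
    inputs, widths, heights, derivs, inverse=False, tails="linear", tail_bound=1.0)

# The transform is invertible: mapping y back recovers the inputs
# (up to float32 round-off) and negates the log-determinant.
x_rec, neg_logdet = piecewise_rational_quadratic_transform(
    y, widths, heights, derivs, inverse=True, tails="linear", tail_bound=1.0)
print(torch.allclose(x_rec, inputs, atol=1e-4),
      torch.allclose(neg_logdet, -logdet, atol=1e-4))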
dreamvoice/train_utils/src/openvoice/utils.py
ADDED
@@ -0,0 +1,194 @@
import re
import json
import numpy as np


def get_hparams_from_file(config_path):
    with open(config_path, "r", encoding="utf-8") as f:
        data = f.read()
    config = json.loads(data)

    hparams = HParams(**config)
    return hparams

class HParams:
    def __init__(self, **kwargs):
        for k, v in kwargs.items():
            if type(v) == dict:
                v = HParams(**v)
            self[k] = v

    def keys(self):
        return self.__dict__.keys()

    def items(self):
        return self.__dict__.items()

    def values(self):
        return self.__dict__.values()

    def __len__(self):
        return len(self.__dict__)

    def __getitem__(self, key):
        return getattr(self, key)

    def __setitem__(self, key, value):
        return setattr(self, key, value)

    def __contains__(self, key):
        return key in self.__dict__

    def __repr__(self):
        return self.__dict__.__repr__()


def string_to_bits(string, pad_len=8):
    # Convert each character to its ASCII value
    ascii_values = [ord(char) for char in string]

    # Convert ASCII values to binary representation
    binary_values = [bin(value)[2:].zfill(8) for value in ascii_values]

    # Convert binary strings to integer arrays
    bit_arrays = [[int(bit) for bit in binary] for binary in binary_values]

    # Convert list of arrays to NumPy array
    numpy_array = np.array(bit_arrays)
    numpy_array_full = np.zeros((pad_len, 8), dtype=numpy_array.dtype)
    numpy_array_full[:, 2] = 1
    max_len = min(pad_len, len(numpy_array))
    numpy_array_full[:max_len] = numpy_array[:max_len]
    return numpy_array_full


def bits_to_string(bits_array):
    # Convert each row of the array to a binary string
    binary_values = [''.join(str(bit) for bit in row) for row in bits_array]

    # Convert binary strings to ASCII values
    ascii_values = [int(binary, 2) for binary in binary_values]

    # Convert ASCII values to characters
    output_string = ''.join(chr(value) for value in ascii_values)

    return output_string


def split_sentence(text, min_len=10, language_str='[EN]'):
    if language_str in ['EN']:
        sentences = split_sentences_latin(text, min_len=min_len)
    else:
        sentences = split_sentences_zh(text, min_len=min_len)
    return sentences

def split_sentences_latin(text, min_len=10):
    """Split long sentences into a list of short ones.

    Args:
        str: Input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    # deal with dirty sentences
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    text = re.sub('[“”]', '"', text)
    text = re.sub('[‘’]', "'", text)
    text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
    text = re.sub('[\n\t ]+', ' ', text)
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # split
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0: del sentences[-1]

    new_sentences = []
    new_sent = []
    count_len = 0
    for ind, sent in enumerate(sentences):
        # print(sent)
        new_sent.append(sent)
        count_len += len(sent.split(" "))
        if count_len > min_len or ind == len(sentences) - 1:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_latin(new_sentences)


def merge_short_sentences_latin(sens):
    """Avoid short sentences by merging them with the following sentence.

    Args:
        List[str]: list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge them with
        # the current sentence.
        if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    try:
        if len(sens_out[-1].split(" ")) <= 2:
            sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
            sens_out.pop(-1)
    except:
        pass
    return sens_out

def split_sentences_zh(text, min_len=10):
    text = re.sub('[。!?;]', '.', text)
    text = re.sub('[,]', ',', text)
    # Replace newlines, tabs and runs of spaces with a single space
    text = re.sub('[\n\t ]+', ' ', text)
    # Add a split marker after each punctuation mark
    text = re.sub('([,.!?;])', r'\1 $#!', text)
    # Split into sentences and strip leading/trailing whitespace
    # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
    sentences = [s.strip() for s in text.split('$#!')]
    if len(sentences[-1]) == 0: del sentences[-1]

    new_sentences = []
    new_sent = []
    count_len = 0
    for ind, sent in enumerate(sentences):
        new_sent.append(sent)
        count_len += len(sent)
        if count_len > min_len or ind == len(sentences) - 1:
            count_len = 0
            new_sentences.append(' '.join(new_sent))
            new_sent = []
    return merge_short_sentences_zh(new_sentences)


def merge_short_sentences_zh(sens):
    # return sens
    """Avoid short sentences by merging them with the following sentence.

    Args:
        List[str]: list of input sentences.

    Returns:
        List[str]: list of output sentences.
    """
    sens_out = []
    for s in sens:
        # If the previous sentence is too short, merge them with
        # the current sentence.
        if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
            sens_out[-1] = sens_out[-1] + " " + s
        else:
            sens_out.append(s)
    try:
        if len(sens_out[-1]) <= 2:
            sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
            sens_out.pop(-1)
    except:
        pass
    return sens_out
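A usage sketch of the utilities above (illustrative, not part of the diff). Note that split_sentence only takes the Latin branch when language_str is exactly 'EN'; the default '[EN]' falls through to the Chinese splitter, so pass the bare code explicitly.

from openvoice.utils import HParams, split_sentence, string_to_bits, bits_to_string

print(split_sentence('This is a short clause, followed by a longer one that keeps going.',
                     min_len=10, language_str='EN'))

bits = string_to_bits('hi', pad_len=8)   # (8, 8) bit matrix, one row per character
print(repr(bits_to_string(bits)))        # 'hi' plus six spaces (padding rows decode to ASCII 32)

hps = HParams(**{'model': {'hidden_channels': 192}, 'sampling_rate': 22050})
print(hps.model.hidden_channels, hps['sampling_rate'])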