Higobeatz commited on
Commit
bd3a23c
·
1 Parent(s): ca0bd50

openvoice plugin

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +13 -0
  2. dreamvoice/train_utils/prepare/get_dist.py +49 -0
  3. dreamvoice/train_utils/prepare/plugin_meta.csv +0 -0
  4. dreamvoice/train_utils/prepare/prepare_se.py +101 -0
  5. dreamvoice/train_utils/prepare/prompts.csv +0 -0
  6. dreamvoice/train_utils/prepare/val_meta.csv +121 -0
  7. dreamvoice/train_utils/src/configs/plugin.py +44 -0
  8. dreamvoice/train_utils/src/dataset/__init__.py +1 -0
  9. dreamvoice/train_utils/src/dataset/dreamvc.py +36 -0
  10. dreamvoice/train_utils/src/inference.py +114 -0
  11. dreamvoice/train_utils/src/model/p2e_cross.py +80 -0
  12. dreamvoice/train_utils/src/model/p2e_cross.yaml +26 -0
  13. dreamvoice/train_utils/src/modules/speaker_encoder/LICENSE +24 -0
  14. dreamvoice/train_utils/src/modules/speaker_encoder/README.md +64 -0
  15. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/__init__.py +1 -0
  16. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/audio.py +157 -0
  17. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/config.py +47 -0
  18. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/__init__.py +4 -0
  19. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py +39 -0
  20. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker.py +42 -0
  21. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py +14 -0
  22. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py +58 -0
  23. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/utterance.py +28 -0
  24. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/inference.py +211 -0
  25. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/model.py +137 -0
  26. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_data.py +30 -0
  27. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_model.py +12 -0
  28. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/preprocess.py +177 -0
  29. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/train.py +127 -0
  30. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/__init__.py +1 -0
  31. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/argutils.py +42 -0
  32. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/logmmse.py +222 -0
  33. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/profiler.py +47 -0
  34. dreamvoice/train_utils/src/modules/speaker_encoder/encoder/visualizations.py +180 -0
  35. dreamvoice/train_utils/src/openvoice/__init__.py +0 -0
  36. dreamvoice/train_utils/src/openvoice/api.py +202 -0
  37. dreamvoice/train_utils/src/openvoice/attentions.py +465 -0
  38. dreamvoice/train_utils/src/openvoice/commons.py +160 -0
  39. dreamvoice/train_utils/src/openvoice/mel_processing.py +183 -0
  40. dreamvoice/train_utils/src/openvoice/models.py +499 -0
  41. dreamvoice/train_utils/src/openvoice/modules.py +598 -0
  42. dreamvoice/train_utils/src/openvoice/openvoice_app.py +275 -0
  43. dreamvoice/train_utils/src/openvoice/se_extractor.py +153 -0
  44. dreamvoice/train_utils/src/openvoice/text/__init__.py +79 -0
  45. dreamvoice/train_utils/src/openvoice/text/cleaners.py +16 -0
  46. dreamvoice/train_utils/src/openvoice/text/english.py +188 -0
  47. dreamvoice/train_utils/src/openvoice/text/mandarin.py +326 -0
  48. dreamvoice/train_utils/src/openvoice/text/symbols.py +88 -0
  49. dreamvoice/train_utils/src/openvoice/transforms.py +209 -0
  50. dreamvoice/train_utils/src/openvoice/utils.py +194 -0
.gitignore ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Ignore Jupyter Notebook checkpoints
2
+ .ipynb_checkpoints/
3
+
4
+ # Ignore Python bytecode files
5
+ *.pyc
6
+ *.pyo
7
+ *.pyd
8
+ __pycache__/
9
+
10
+ # Ignore virtual environments
11
+ venv/
12
+ env/
13
+ .virtualenv/
dreamvoice/train_utils/prepare/get_dist.py ADDED
@@ -0,0 +1,49 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import random
4
+ import numpy as np
5
+
6
+
7
+ # Function to recursively find all .pt files in a directory
8
+ def find_pt_files(root_dir):
9
+ pt_files = []
10
+ for dirpath, _, filenames in os.walk(root_dir):
11
+ for file in filenames:
12
+ if file.endswith('.pt'):
13
+ pt_files.append(os.path.join(dirpath, file))
14
+ return pt_files
15
+
16
+
17
+ # Function to compute statistics for a given tensor list
18
+ def compute_statistics(tensor_list):
19
+ all_data = torch.cat(tensor_list)
20
+ mean = torch.mean(all_data).item()
21
+ std = torch.std(all_data).item()
22
+ max_val = torch.max(all_data).item()
23
+ min_val = torch.min(all_data).item()
24
+ return mean, std, max_val, min_val
25
+
26
+
27
+ # Root directory containing .pt files in subfolders
28
+ root_dir = "spk"
29
+
30
+ # Find all .pt files
31
+ pt_files = find_pt_files(root_dir)
32
+
33
+ # Randomly sample 1000 .pt files (or fewer if less than 1000 files are available)
34
+ sampled_files = random.sample(pt_files, min(1000, len(pt_files)))
35
+
36
+ # Load tensors from sampled files
37
+ tensor_list = []
38
+ for file in sampled_files:
39
+ tensor = torch.load(file)
40
+ tensor_list.append(tensor.view(-1)) # Flatten the tensor
41
+
42
+ # Compute statistics
43
+ mean, std, max_val, min_val = compute_statistics(tensor_list)
44
+
45
+ # Print the results
46
+ print(f"Mean: {mean}")
47
+ print(f"Std: {std}")
48
+ print(f"Max: {max_val}")
49
+ print(f"Min: {min_val}")
dreamvoice/train_utils/prepare/plugin_meta.csv ADDED
The diff for this file is too large to render. See raw diff
 
dreamvoice/train_utils/prepare/prepare_se.py ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import librosa
4
+ from tqdm import tqdm
5
+ from openvoice.api import ToneColorConverter
6
+ from openvoice.mel_processing import spectrogram_torch
7
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
8
+
9
+
10
+ @torch.no_grad()
11
+ def se_extractor(audio_path, vc):
12
+ # vad
13
+ SAMPLE_RATE = 16000
14
+ audio_vad = get_audio_tensor(audio_path)
15
+ segments = get_vad_segments(
16
+ audio_vad,
17
+ output_sample=True,
18
+ min_speech_duration=0.1,
19
+ min_silence_duration=1,
20
+ method="silero",
21
+ )
22
+ segments = [(seg["start"], seg["end"]) for seg in segments]
23
+ segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
24
+
25
+ if len(segments) == 0:
26
+ segments = [(0, len(audio_vad)/SAMPLE_RATE)]
27
+ print(segments)
28
+
29
+ # spk
30
+ hps = vc.hps
31
+ device = vc.device
32
+ model = vc.model
33
+ gs = []
34
+
35
+ audio, sr = librosa.load(audio_path, sr=hps.data.sampling_rate)
36
+ audio = torch.tensor(audio).float().to(device)
37
+
38
+ for s, e in segments:
39
+ y = audio[int(hps.data.sampling_rate*s):int(hps.data.sampling_rate*e)]
40
+ y = y.to(device)
41
+ y = y.unsqueeze(0)
42
+ y = spectrogram_torch(y, hps.data.filter_length,
43
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
44
+ center=False).to(device)
45
+ g = model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
46
+ gs.append(g.detach())
47
+
48
+ gs = torch.stack(gs).mean(0)
49
+ return gs.cpu()
50
+
51
+
52
+ def process_audio_folder(input_folder, output_folder, model, device):
53
+ """
54
+ Process all audio files in a folder and its subfolders,
55
+ save the extracted features as .pt files in the output folder with the same structure.
56
+
57
+ Args:
58
+ input_folder (str): Path to the input folder containing audio files.
59
+ output_folder (str): Path to the output folder to save .pt files.
60
+ model: Pre-trained model for feature extraction.
61
+ device: Torch device (e.g., 'cpu' or 'cuda').
62
+ """
63
+ # Collect all audio file paths
64
+ audio_files = []
65
+ for root, _, files in os.walk(input_folder):
66
+ for file in files:
67
+ if file.endswith(('.wav', '.mp3', '.flac')): # Adjust for the audio formats you want to process
68
+ audio_files.append(os.path.join(root, file))
69
+
70
+ # Process each audio file with tqdm for progress
71
+ for audio_path in tqdm(audio_files, desc="Processing audio files", unit="file"):
72
+ # Construct output path
73
+ relative_path = os.path.relpath(os.path.dirname(audio_path), input_folder)
74
+ output_dir = os.path.join(output_folder, relative_path)
75
+ os.makedirs(output_dir, exist_ok=True)
76
+ output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(audio_path))[0] + '.pt')
77
+
78
+ # Check if the .pt file already exists
79
+ if os.path.exists(output_path):
80
+ # print(f"Skipped (already exists): {output_path}")
81
+ continue # Skip processing this file
82
+ # Extract features
83
+ target_se = se_extractor(audio_path, model).to(device)
84
+ # Save the feature as .pt
85
+ torch.save(target_se, output_path)
86
+ # print(f"Processed and saved: {output_path}")
87
+
88
+
89
+ if __name__ == '__main__':
90
+ ckpt_converter = 'checkpoints_v2/converter'
91
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
92
+ model = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
93
+ model.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
94
+
95
+ input_folder = '/home/jerry/Projects/Dataset/VCTK/24k/VCTK-Corpus/'
96
+ output_folder = 'spk/VCTK-Corpus/'
97
+ process_audio_folder(input_folder, output_folder, model, device)
98
+
99
+ input_folder = '/home/jerry/Projects/Dataset/Speech/vctk_libritts/LibriTTS-R/train-clean-360'
100
+ output_folder = 'spk/LibriTTS-R/train-clean-360/'
101
+ process_audio_folder(input_folder, output_folder, model, device)
dreamvoice/train_utils/prepare/prompts.csv ADDED
The diff for this file is too large to render. See raw diff
 
dreamvoice/train_utils/prepare/val_meta.csv ADDED
@@ -0,0 +1,121 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ path,prompt
2
+ LibriTTS-R/dev-clean/3081/166546/3081_166546_000101_000001.wav,"A gender-ambiguous teenager's voice, bright and smooth, perfect for client and public interaction."
3
+ LibriTTS-R/dev-clean/3081/166546/3081_166546_000028_000002.wav,"A mature male voice, ideal for delivering creative narratives through oral storytelling."
4
+ LibriTTS-R/dev-clean/3081/166546/3081_166546_000101_000001.wav,"An adult man's voice, charming and appealing, perfect for captivating storytelling."
5
+ LibriTTS-R/dev-clean/84/121550/84_121550_000292_000000.wav,"A bright and engaging teenager's voice, suitable for client and public interaction."
6
+ LibriTTS-R/dev-clean/84/121550/84_121550_000303_000000.wav,An elderly gentleman with a smooth and attractive voice.
7
+ LibriTTS-R/dev-clean/84/121123/84_121123_000010_000000.wav,"A middle-aged woman with a bright, light voice."
8
+ LibriTTS-R/dev-clean/5895/34615/5895_34615_000029_000000.wav,A young and gender-neutral teenager's voice.
9
+ LibriTTS-R/dev-clean/5895/34615/5895_34615_000009_000000.wav,A warm and attractive adult male voice.
10
+ LibriTTS-R/dev-clean/5895/34622/5895_34622_000018_000001.wav,"A middle-aged male voice, rough and husky."
11
+ LibriTTS-R/dev-clean/2035/147960/2035_147960_000022_000005.wav,"An elderly male voice with a dark, rough, authoritative, and strong tone."
12
+ LibriTTS-R/dev-clean/2035/152373/2035_152373_000010_000002.wav,"An elderly male voice, exuding warmth and kindness."
13
+ LibriTTS-R/dev-clean/2035/147960/2035_147960_000003_000002.wav,"A mature woman's voice, bright and smooth, ideal for client and public interaction."
14
+ LibriTTS-R/dev-clean/1673/143396/1673_143396_000017_000002.wav,"A deep, rich, and dark voice of an elderly man."
15
+ LibriTTS-R/dev-clean/1673/143397/1673_143397_000016_000002.wav,A middle-aged man with a masculine voice.
16
+ LibriTTS-R/dev-clean/1673/143397/1673_143397_000031_000006.wav,"A teenage girl's voice, bright and cute, yet with a hint of weakness, perfect for client and public interaction."
17
+ LibriTTS-R/dev-clean/2803/154328/2803_154328_000019_000000.wav,"An older man's voice, deep and rich, exuding charm and allure."
18
+ LibriTTS-R/dev-clean/2803/154320/2803_154320_000007_000000.wav,"An adult female voice that is smooth and captivating, perfect for storytelling."
19
+ LibriTTS-R/dev-clean/2803/154328/2803_154328_000069_000000.wav,An adult woman's weak voice for client and public interaction.
20
+ LibriTTS-R/dev-clean/3752/4944/3752_4944_000097_000000.wav,"An elderly male voice with a deep, silky, and charming tone."
21
+ LibriTTS-R/dev-clean/3752/4944/3752_4944_000009_000000.wav,"A teenage girl's voice, suited for client and public interaction."
22
+ LibriTTS-R/dev-clean/3752/4943/3752_4943_000061_000000.wav,"A mature male voice, authoritative and commanding, ideal for political negotiation and legal discourse."
23
+ LibriTTS-R/dev-clean/1919/142785/1919_142785_000038_000001.wav,"An adult male voice, deep and rich, suited for public speaking engagements."
24
+ LibriTTS-R/dev-clean/1919/142785/1919_142785_000012_000001.wav,"A mature, gender-ambiguous adult voice that is silky and mellow."
25
+ LibriTTS-R/dev-clean/1919/142785/1919_142785_000035_000003.wav,"A teenage girl's voice, bright and captivating for storytelling."
26
+ LibriTTS-R/dev-clean/6313/66129/6313_66129_000074_000003.wav,An androgynous elderly voice.
27
+ LibriTTS-R/dev-clean/6313/76958/6313_76958_000008_000000.wav,"An elderly male voice, deep and rich, with a warm and inviting quality, perfect for storytelling with a heartfelt touch."
28
+ LibriTTS-R/dev-clean/6313/66129/6313_66129_000031_000000.wav,"An adult male voice, dark and captivating, perfect for storytelling."
29
+ LibriTTS-R/dev-clean/652/130737/652_130737_000031_000000.wav,"An adult male voice, dark and attractive, exuding warmth and charm."
30
+ LibriTTS-R/dev-clean/652/130737/652_130737_000031_000000.wav,"A teenage boy with a confident voice, ideal for client and public interaction."
31
+ LibriTTS-R/dev-clean/652/129742/652_129742_000010_000003.wav,"A mature man's voice, deep and resonant with a touch of twanginess, perfect for customer service and public engagement roles."
32
+ LibriTTS-R/dev-clean/2902/9008/2902_9008_000026_000003.wav,"An authoritative senior male voice, dark and strong yet warm and comforting."
33
+ LibriTTS-R/dev-clean/2902/9006/2902_9006_000011_000000.wav,"An elderly male voice with a dark, rough, authoritative, and strong tone."
34
+ LibriTTS-R/dev-clean/2902/9008/2902_9008_000008_000002.wav,An adult male voice that is bright and authoritative.
35
+ LibriTTS-R/dev-clean/7976/105575/7976_105575_000006_000001.wav,"A young male voice, charming and sweet."
36
+ LibriTTS-R/dev-clean/7976/105575/7976_105575_000015_000000.wav,"An elderly man's voice, rough and hoarse."
37
+ LibriTTS-R/dev-clean/7976/110523/7976_110523_000032_000002.wav,"A mature adult female voice, commanding and assertive."
38
+ LibriTTS-R/dev-clean/7850/111771/7850_111771_000006_000000.wav,"An elderly male voice with a deep, rich tone that is inviting and heartfelt."
39
+ LibriTTS-R/dev-clean/7850/281318/7850_281318_000006_000000.wav,A gender-ambiguous teenager's voice.
40
+ LibriTTS-R/dev-clean/7850/281318/7850_281318_000001_000003.wav,"A middle-aged male voice, feeble and faint."
41
+ LibriTTS-R/dev-clean/2086/149220/2086_149220_000045_000002.wav,"An adult male voice, smooth and velvety."
42
+ LibriTTS-R/dev-clean/2086/149220/2086_149220_000006_000012.wav,"An adult woman's voice, bright and smooth, ideal for client and public interaction."
43
+ LibriTTS-R/dev-clean/2086/149214/2086_149214_000004_000003.wav,A senior male voice that is strong and authoritative.
44
+ LibriTTS-R/dev-clean/2412/153947/2412_153947_000017_000005.wav,A mature female voice suited for customer service and public engagement.
45
+ LibriTTS-R/dev-clean/2412/153947/2412_153947_000017_000005.wav,"An adult female voice, bright and warm, perfect for client and public interaction."
46
+ LibriTTS-R/dev-clean/2412/153954/2412_153954_000006_000003.wav,"A senior male voice with a dark, rough texture."
47
+ LibriTTS-R/dev-clean/1988/148538/1988_148538_000011_000000.wav,"An adult male voice, dark, authoritative, and strong, perfect for storytelling."
48
+ LibriTTS-R/dev-clean/1988/147956/1988_147956_000009_000008.wav,"A senior female voice that is smooth, warm, and attractive."
49
+ LibriTTS-R/dev-clean/1988/24833/1988_24833_000009_000003.wav,A youthful voice with an androgynous and gender-neutral quality.
50
+ LibriTTS-R/dev-clean/6319/275224/6319_275224_000022_000008.wav,"A female adult voice, perfect for captivating storytelling."
51
+ LibriTTS-R/dev-clean/6319/275224/6319_275224_000024_000001.wav,A commanding and powerful adult female voice.
52
+ LibriTTS-R/dev-clean/6319/275224/6319_275224_000022_000009.wav,"A senior male voice with a dark, rough texture."
53
+ LibriTTS-R/dev-clean/2428/83705/2428_83705_000025_000000.wav,A man's voice in adulthood.
54
+ LibriTTS-R/dev-clean/2428/83699/2428_83699_000033_000003.wav,"An adult male voice that is warm and attractive, ideal for engaging storytelling."
55
+ LibriTTS-R/dev-clean/2428/83705/2428_83705_000023_000002.wav,"A young and gender-ambiguous teenager with a rough, hoarse voice."
56
+ LibriTTS-R/dev-clean/5536/43359/5536_43359_000003_000002.wav,"A mature male voice with a deep, hoarse quality."
57
+ LibriTTS-R/dev-clean/5536/43359/5536_43359_000023_000000.wav,"A senior man's voice, dark, rough, strong, and authoritative, perfect for storytelling."
58
+ LibriTTS-R/dev-clean/5536/43359/5536_43359_000010_000002.wav,An adult male with a strong voice.
59
+ LibriTTS-R/dev-clean/422/122949/422_122949_000001_000000.wav,"An adult man's bright voice, perfect for storytelling."
60
+ LibriTTS-R/dev-clean/422/122949/422_122949_000013_000010.wav,"An adult woman's voice, smooth and captivating, perfect for storytelling and creative narration."
61
+ LibriTTS-R/dev-clean/422/122949/422_122949_000001_000000.wav,An older male's voice.
62
+ LibriTTS-R/dev-clean/251/137823/251_137823_000056_000002.wav,"A senior male voice, dark, rough, authoritative, and attractive."
63
+ LibriTTS-R/dev-clean/251/137823/251_137823_000030_000000.wav,"An elderly woman with a bright, nasal voice."
64
+ LibriTTS-R/dev-clean/251/136532/251_136532_000002_000004.wav,"An adult male voice, dark, attractive, and authoritative, perfect for public presentations."
65
+ LibriTTS-R/dev-clean/3170/137482/3170_137482_000027_000005.wav,"A gender-ambiguous teenager with a cute, sweet voice."
66
+ LibriTTS-R/dev-clean/3170/137482/3170_137482_000003_000005.wav,"An adult female voice, authoritative and commanding, suited for roles in diplomacy and judiciary."
67
+ LibriTTS-R/dev-clean/3170/137482/3170_137482_000007_000000.wav,"A senior male voice, authoritative and commanding, perfect for public presentations."
68
+ LibriTTS-R/dev-clean/174/84280/174_84280_000016_000000.wav,"A senior man's voice, dark, rough, and authoritative for public presentations."
69
+ LibriTTS-R/dev-clean/174/50561/174_50561_000022_000000.wav,"An adult male voice that is dark, attractive, and warm."
70
+ LibriTTS-R/dev-clean/174/168635/174_168635_000025_000000.wav,"An adult male voice, bright and engaging, perfect for public presentations."
71
+ LibriTTS-R/dev-clean/3853/163249/3853_163249_000134_000000.wav,A bright and smooth teenage girl with an attractive voice.
72
+ LibriTTS-R/dev-clean/3853/163249/3853_163249_000088_000000.wav,"A middle-aged man with a deep, hoarse, and powerful voice."
73
+ LibriTTS-R/dev-clean/3853/163249/3853_163249_000077_000000.wav,"An adult voice with a gender-ambiguous tone, suitable for client and public interaction."
74
+ LibriTTS-R/dev-clean/1272/141231/1272_141231_000013_000001.wav,An older male voice with a dark and attractive tone.
75
+ LibriTTS-R/dev-clean/1272/141231/1272_141231_000027_000005.wav,A teenage boy with a feeble voice.
76
+ LibriTTS-R/dev-clean/1272/141231/1272_141231_000034_000003.wav,"A mature adult male voice, with a deep, attractive and alluring tone."
77
+ LibriTTS-R/dev-clean/6295/244435/6295_244435_000014_000000.wav,"A mature man's voice, deep and powerful, with an alluring quality, perfect for storytelling."
78
+ LibriTTS-R/dev-clean/6295/64301/6295_64301_000009_000003.wav,"A mature male voice with a deep and rich tone, ideal for captivating storytelling."
79
+ LibriTTS-R/dev-clean/6295/64301/6295_64301_000017_000000.wav,A mature female voice with a hoarse and husky quality.
80
+ LibriTTS-R/dev-clean/8297/275154/8297_275154_000011_000001.wav,"An elderly voice with a smooth, silky texture."
81
+ LibriTTS-R/dev-clean/8297/275154/8297_275154_000022_000011.wav,"An adult woman's voice that is bright, smooth, attractive, and warm, perfect for engaging storytelling."
82
+ LibriTTS-R/dev-clean/8297/275154/8297_275154_000024_000007.wav,"An adult male voice, dark and rough, exuding authority and perfect for public presentations."
83
+ LibriTTS-R/dev-clean/1462/170138/1462_170138_000019_000002.wav,"A confident and commanding adult female voice, with a smooth and authoritative tone."
84
+ LibriTTS-R/dev-clean/1462/170138/1462_170138_000003_000004.wav,"An adult male voice, deep and commanding with a sense of authority."
85
+ LibriTTS-R/dev-clean/1462/170142/1462_170142_000041_000001.wav,"A mature female voice, deep and authoritative."
86
+ LibriTTS-R/dev-clean/2277/149897/2277_149897_000023_000000.wav,"An adult male voice with a smooth, warm tone and a subtle nasal quality."
87
+ LibriTTS-R/dev-clean/2277/149896/2277_149896_000013_000000.wav,"An adult man's voice, dark and strong, with an attractive allure, ideal for storytelling."
88
+ LibriTTS-R/dev-clean/2277/149896/2277_149896_000025_000003.wav,"An adult man's voice, weak yet engaging for client and public interaction."
89
+ LibriTTS-R/dev-clean/8842/302201/8842_302201_000008_000005.wav,"A teenage boy's voice, nasal and weak, suitable for client and public interaction."
90
+ LibriTTS-R/dev-clean/8842/304647/8842_304647_000017_000001.wav,"An adult male voice, dark and rough, with an attractive charm suited for diplomacy and judiciary work."
91
+ LibriTTS-R/dev-clean/8842/302203/8842_302203_000020_000002.wav,"A mature woman's voice, deep and rich, ideal for political negotiation and legal discourse."
92
+ LibriTTS-R/dev-clean/5338/284437/5338_284437_000054_000002.wav,"A senior male voice, dark and smooth, with an attractive and alluring quality, perfect for storytelling narratives."
93
+ LibriTTS-R/dev-clean/5338/284437/5338_284437_000034_000000.wav,"An elderly female voice, rough and rugged."
94
+ LibriTTS-R/dev-clean/5338/284437/5338_284437_000046_000002.wav,"An older male voice, deep and powerful."
95
+ LibriTTS-R/dev-clean/3576/138058/3576_138058_000051_000000.wav,"An adult female voice, authoritative and commanding, suited for political negotiation and legal discourse."
96
+ LibriTTS-R/dev-clean/3576/138058/3576_138058_000024_000001.wav,"A young male voice, feeble and faint."
97
+ LibriTTS-R/dev-clean/3576/138058/3576_138058_000042_000000.wav,"An elderly male voice, dark, strong, and authoritative in tone."
98
+ LibriTTS-R/dev-clean/6345/93302/6345_93302_000063_000001.wav,"An adult male voice, dark and warm in tone."
99
+ LibriTTS-R/dev-clean/6345/93302/6345_93302_000069_000000.wav,"An adult male voice, dark and rough yet warm and inviting, perfect for captivating storytelling."
100
+ LibriTTS-R/dev-clean/6345/64257/6345_64257_000007_000003.wav,"An adult man's voice, dark, warm, and attractive, perfect for engaging storytelling."
101
+ LibriTTS-R/dev-clean/3000/15664/3000_15664_000006_000002.wav,"A senior male voice, dark and smooth, with an attractive tone, perfect for captivating storytelling."
102
+ LibriTTS-R/dev-clean/3000/15664/3000_15664_000040_000001.wav,"A mature male voice with a deep, husky and rough texture."
103
+ LibriTTS-R/dev-clean/3000/15664/3000_15664_000025_000000.wav,A mature and androgynous voice.
104
+ LibriTTS-R/dev-clean/1993/147966/1993_147966_000011_000003.wav,"An adult woman's voice, dark, smooth, and attractive, perfect for storytelling."
105
+ LibriTTS-R/dev-clean/1993/147965/1993_147965_000007_000001.wav,"An adult woman's voice, warm and inviting, perfect for creative storytelling."
106
+ LibriTTS-R/dev-clean/1993/147965/1993_147965_000002_000003.wav,"A teenager with a bright and lively voice, gender-ambiguous."
107
+ LibriTTS-R/dev-clean/3536/8226/3536_8226_000026_000012.wav,A mature female voice ideal for client and public interaction.
108
+ LibriTTS-R/dev-clean/3536/23268/3536_23268_000028_000000.wav,"An adult voice with a gender-ambiguous tone, bright and smooth."
109
+ LibriTTS-R/dev-clean/3536/8226/3536_8226_000026_000009.wav,"A mature male voice, ideal for delivering creative narratives through oral storytelling."
110
+ LibriTTS-R/dev-clean/5694/64029/5694_64029_000028_000001.wav,"A middle-aged man's attractive voice, perfect for captivating storytelling."
111
+ LibriTTS-R/dev-clean/5694/64038/5694_64038_000005_000000.wav,An elderly male voice that is authoritative and strong.
112
+ LibriTTS-R/dev-clean/5694/64025/5694_64025_000003_000000.wav,"An elderly male voice, authoritative and commanding in tone."
113
+ LibriTTS-R/dev-clean/6241/61943/6241_61943_000020_000000.wav,"An adult man's voice, dark and authoritative, ideal for diplomacy and judiciary roles."
114
+ LibriTTS-R/dev-clean/6241/61946/6241_61946_000043_000000.wav,A young boy's voice with a hoarse and husky tone.
115
+ LibriTTS-R/dev-clean/6241/61943/6241_61943_000039_000004.wav,"A mature, gender-neutral adult voice, smooth and perfect for storytelling."
116
+ LibriTTS-R/dev-clean/2078/142845/2078_142845_000018_000000.wav,"A teenage girl's sweet and charming voice, perfect for customer service and public engagement."
117
+ LibriTTS-R/dev-clean/2078/142845/2078_142845_000049_000000.wav,"An adult male voice that is dark, rough, attractive, and authoritative."
118
+ LibriTTS-R/dev-clean/2078/142845/2078_142845_000052_000000.wav,"An adult female voice, smooth and silky, perfect for customer service and public engagement roles."
119
+ LibriTTS-R/dev-clean/777/126732/777_126732_000076_000007.wav,"A senior male voice, characterized by a dark and weak tone."
120
+ LibriTTS-R/dev-clean/777/126732/777_126732_000076_000006.wav,"A senior man with a dark, authoritative, and strong voice, suited for diplomacy and judiciary professions."
121
+ LibriTTS-R/dev-clean/777/126732/777_126732_000076_000007.wav,"A mature male voice with a nasal, twangy quality."
dreamvoice/train_utils/src/configs/plugin.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+
3
+
4
+ class AttrDict(dict):
5
+ def __init__(self, *args, **kwargs):
6
+ super(AttrDict, self).__init__(*args, **kwargs)
7
+ self.__dict__ = self
8
+
9
+ def override(self, attrs):
10
+ if isinstance(attrs, dict):
11
+ self.__dict__.update(**attrs)
12
+ elif isinstance(attrs, (list, tuple, set)):
13
+ for attr in attrs:
14
+ self.override(attr)
15
+ elif attrs is not None:
16
+ raise NotImplementedError
17
+ return self
18
+
19
+
20
+ all_params = {
21
+ 'Plugin_base': AttrDict(
22
+ # Diff params
23
+ diff=AttrDict(
24
+ num_train_steps=1000,
25
+ beta_start=1e-4,
26
+ beta_end=0.02,
27
+ num_infer_steps=50,
28
+ v_prediction=True,
29
+ ),
30
+
31
+ text_encoder=AttrDict(
32
+ model='google/flan-t5-base'
33
+ ),
34
+ opt=AttrDict(
35
+ learning_rate=1e-4,
36
+ beta1=0.9,
37
+ beta2=0.999,
38
+ weight_decay=1e-4,
39
+ adam_epsilon=1e-08,
40
+ ),),
41
+ }
42
+
43
+ def get_params(name):
44
+ return all_params[name]
dreamvoice/train_utils/src/dataset/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ from .vcdata import VCData
dreamvoice/train_utils/src/dataset/dreamvc.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import random
4
+ import ast
5
+ import numpy as np
6
+ import torch
7
+ from einops import repeat, rearrange
8
+ import librosa
9
+
10
+ from torch.utils.data import Dataset
11
+ import torchaudio
12
+
13
+
14
+ class DreamData(Dataset):
15
+ def __init__(self, data_dir, meta_dir, subset, prompt_dir,):
16
+ self.datadir = data_dir
17
+ meta = pd.read_csv(meta_dir)
18
+ self.meta = meta[meta['subset'] == subset]
19
+ self.subset = subset
20
+ self.prompts = pd.read_csv(prompt_dir)
21
+
22
+ def __getitem__(self, index):
23
+ row = self.meta.iloc[index]
24
+
25
+ # get spk
26
+ spk_path = self.datadir + row['spk_path']
27
+ spk = torch.load(spk_path, map_location='cpu').squeeze(0)
28
+
29
+ speaker = row['speaker']
30
+
31
+ # get prompt
32
+ prompt = self.prompts[self.prompts['speaker_id'] == str(speaker)].sample(1)['prompt'].iloc[0]
33
+ return spk, prompt
34
+
35
+ def __len__(self):
36
+ return len(self.meta)
dreamvoice/train_utils/src/inference.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import torch
3
+ import soundfile as sf
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+ from utils import minmax_norm_diff, reverse_minmax_norm_diff
7
+ from spk_ext import se_extractor
8
+
9
+
10
+ def rescale_noise_cfg(noise_cfg, noise_pred_text, guidance_rescale=0.0):
11
+ """
12
+ Rescale `noise_cfg` according to `guidance_rescale`. Based on findings of [Common Diffusion Noise Schedules and
13
+ Sample Steps are Flawed](https://arxiv.org/pdf/2305.08891.pdf). See Section 3.4
14
+ """
15
+ std_text = noise_pred_text.std(dim=list(range(1, noise_pred_text.ndim)), keepdim=True)
16
+ std_cfg = noise_cfg.std(dim=list(range(1, noise_cfg.ndim)), keepdim=True)
17
+ # rescale the results from guidance (fixes overexposure)
18
+ noise_pred_rescaled = noise_cfg * (std_text / std_cfg)
19
+ # mix with the original results from guidance by factor guidance_rescale to avoid "plain looking" images
20
+ noise_cfg = guidance_rescale * noise_pred_rescaled + (1 - guidance_rescale) * noise_cfg
21
+ return noise_cfg
22
+
23
+
24
+ @torch.no_grad()
25
+ def inference_timbre(gen_shape, text,
26
+ model, scheduler,
27
+ guidance_scale=5, guidance_rescale=0.7,
28
+ ddim_steps=50, eta=1, random_seed=2023,
29
+ device='cuda',
30
+ ):
31
+ text, text_mask = text
32
+ model.eval()
33
+ generator = torch.Generator(device=device).manual_seed(random_seed)
34
+ scheduler.set_timesteps(ddim_steps)
35
+
36
+ # init noise
37
+ noise = torch.randn(gen_shape, generator=generator, device=device)
38
+ latents = noise
39
+
40
+ for t in scheduler.timesteps:
41
+ latents = scheduler.scale_model_input(latents, t)
42
+
43
+ if guidance_scale:
44
+ output_text = model(latents, t, text, text_mask, train_cfg=False)
45
+ output_uncond = model(latents, t, text, text_mask, train_cfg=True, cfg_prob=1.0)
46
+
47
+ output_pred = output_uncond + guidance_scale * (output_text - output_uncond)
48
+ if guidance_rescale > 0.0:
49
+ output_pred = rescale_noise_cfg(output_pred, output_text,
50
+ guidance_rescale=guidance_rescale)
51
+ else:
52
+ output_pred = model(latents, t, text, text_mask, train_cfg=False)
53
+
54
+ latents = scheduler.step(model_output=output_pred, timestep=t, sample=latents,
55
+ eta=eta, generator=generator).prev_sample
56
+
57
+ # pred = reverse_minmax_norm_diff(latents, vmin=0.0, vmax=0.5)
58
+ # pred = torch.clip(pred, min=0.0, max=0.5)
59
+ return latents
60
+
61
+
62
+ @torch.no_grad()
63
+ def eval_plugin_light(vc_model, text_model,
64
+ timbre_model, timbre_scheduler, timbre_shape,
65
+ val_meta, val_folder,
66
+ guidance_scale=3, guidance_rescale=0.7,
67
+ ddim_steps=50, eta=1, random_seed=2024,
68
+ device='cuda',
69
+ epoch=0, save_path='logs/eval/', val_num=10, sr=24000):
70
+
71
+ tokenizer, text_encoder = text_model
72
+
73
+ df = pd.read_csv(val_meta)
74
+
75
+ save_path = save_path + str(epoch) + '/'
76
+ os.makedirs(save_path, exist_ok=True)
77
+
78
+ step = 0
79
+
80
+ for i in range(len(df)):
81
+ row = df.iloc[i]
82
+
83
+ source_path = val_folder + row['path']
84
+ prompt = [row['prompt']]
85
+
86
+ with torch.no_grad():
87
+ text_batch = tokenizer(prompt,
88
+ max_length=32,
89
+ padding='max_length', truncation=True, return_tensors="pt")
90
+ text, text_mask = text_batch.input_ids.to(device), \
91
+ text_batch.attention_mask.to(device)
92
+ text = text_encoder(input_ids=text, attention_mask=text_mask)[0]
93
+
94
+ spk_embed = inference_timbre(timbre_shape, [text, text_mask],
95
+ timbre_model, timbre_scheduler,
96
+ guidance_scale=guidance_scale, guidance_rescale=guidance_rescale,
97
+ ddim_steps=ddim_steps, eta=eta, random_seed=random_seed,
98
+ device=device)
99
+
100
+ source_se = se_extractor(source_path, vc_model).to(device)
101
+ # print(source_se.shape)
102
+ # print(spk_embed.shape)
103
+
104
+ encode_message = "@MyShell"
105
+ vc_model.convert(
106
+ audio_src_path=source_path,
107
+ src_se=source_se,
108
+ tgt_se=spk_embed,
109
+ output_path=save_path + f'{step}_{prompt[0]}' + '.wav',
110
+ message=encode_message)
111
+
112
+ step += 1
113
+ if step >= val_num:
114
+ break
dreamvoice/train_utils/src/model/p2e_cross.py ADDED
@@ -0,0 +1,80 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from diffusers import UNet2DModel, UNet2DConditionModel
4
+ import yaml
5
+ from einops import repeat, rearrange
6
+
7
+ from typing import Any
8
+ from torch import Tensor
9
+
10
+
11
+ def rand_bool(shape: Any, proba: float, device: Any = None) -> Tensor:
12
+ if proba == 1:
13
+ return torch.ones(shape, device=device, dtype=torch.bool)
14
+ elif proba == 0:
15
+ return torch.zeros(shape, device=device, dtype=torch.bool)
16
+ else:
17
+ return torch.bernoulli(torch.full(shape, proba, device=device)).to(torch.bool)
18
+
19
+
20
+ class FixedEmbedding(nn.Module):
21
+ def __init__(self, features=128):
22
+ super().__init__()
23
+ self.embedding = nn.Embedding(1, features)
24
+
25
+ def forward(self, y):
26
+ B, L, C, device = y.shape[0], y.shape[-2], y.shape[-1], y.device
27
+ embed = self.embedding(torch.zeros(B, device=device).long())
28
+ fixed_embedding = repeat(embed, "b c -> b l c", l=L)
29
+ return fixed_embedding
30
+
31
+
32
+ class P2E_Cross(nn.Module):
33
+ def __init__(self, config):
34
+ super().__init__()
35
+ self.config = config
36
+ self.unet = UNet2DConditionModel(**self.config['unet'])
37
+ self.unet.set_use_memory_efficient_attention_xformers(True)
38
+ self.cfg_embedding = FixedEmbedding(self.config['unet']['cross_attention_dim'])
39
+
40
+ self.context_embedding = nn.Sequential(
41
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']),
42
+ nn.SiLU(),
43
+ nn.Linear(self.config['unet']['cross_attention_dim'], self.config['unet']['cross_attention_dim']))
44
+
45
+ def forward(self, target, t, prompt, prompt_mask=None,
46
+ train_cfg=False, cfg_prob=0.0):
47
+ target = target.unsqueeze(-1)
48
+ B, C, _, _ = target.shape
49
+
50
+ if train_cfg:
51
+ if cfg_prob > 0.0:
52
+ # Randomly mask embedding
53
+ batch_mask = rand_bool(shape=(B, 1, 1), proba=cfg_prob, device=target.device)
54
+ fixed_embedding = self.cfg_embedding(prompt).to(target.dtype)
55
+ prompt = torch.where(batch_mask, fixed_embedding, prompt)
56
+
57
+ prompt = self.context_embedding(prompt)
58
+ # fix the bug that prompt will copy dtype from target in diffusers
59
+ target = target.to(prompt.dtype)
60
+
61
+ output = self.unet(sample=target, timestep=t,
62
+ encoder_hidden_states=prompt,
63
+ encoder_attention_mask=prompt_mask)['sample']
64
+
65
+ return output.squeeze(-1)
66
+
67
+
68
+ if __name__ == "__main__":
69
+ with open('p2e_cross.yaml', 'r') as fp:
70
+ config = yaml.safe_load(fp)
71
+ device = 'cuda'
72
+
73
+ model = P2E_Cross(config['diffwrap']).to(device)
74
+
75
+ x = torch.rand((2, 256)).to(device)
76
+ t = torch.randint(0, 1000, (2,)).long().to(device)
77
+ prompt = torch.rand(2, 64, 768).to(device)
78
+ prompt_mask = torch.ones(2, 64).to(device)
79
+
80
+ output = model(x, t, prompt, prompt_mask, train_cfg=True, cfg_prob=0.25)
dreamvoice/train_utils/src/model/p2e_cross.yaml ADDED
@@ -0,0 +1,26 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ version: 1.0
2
+
3
+ system: "cross"
4
+
5
+ diffwrap:
6
+ cls_embedding:
7
+ content_dim: 768
8
+ content_hidden: 256
9
+
10
+ unet:
11
+ sample_size: [1, 1]
12
+ in_channels: 256
13
+ out_channels: 256
14
+ layers_per_block: 2
15
+ block_out_channels: [256]
16
+ down_block_types:
17
+ [
18
+ "CrossAttnDownBlock2D",
19
+ ]
20
+ up_block_types:
21
+ [
22
+ "CrossAttnUpBlock2D",
23
+ ]
24
+ attention_head_dim: 32
25
+ cross_attention_dim: 768
26
+
dreamvoice/train_utils/src/modules/speaker_encoder/LICENSE ADDED
@@ -0,0 +1,24 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Modified & original work Copyright (c) 2019 Corentin Jemine (https://github.com/CorentinJ)
4
+ Original work Copyright (c) 2018 Rayhane Mama (https://github.com/Rayhane-mamah)
5
+ Original work Copyright (c) 2019 fatchord (https://github.com/fatchord)
6
+ Original work Copyright (c) 2015 braindead (https://github.com/braindead)
7
+
8
+ Permission is hereby granted, free of charge, to any person obtaining a copy
9
+ of this software and associated documentation files (the "Software"), to deal
10
+ in the Software without restriction, including without limitation the rights
11
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12
+ copies of the Software, and to permit persons to whom the Software is
13
+ furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all
16
+ copies or substantial portions of the Software.
17
+
18
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
24
+ SOFTWARE.
dreamvoice/train_utils/src/modules/speaker_encoder/README.md ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Real-Time Voice Cloning
2
+ This repository is an implementation of [Transfer Learning from Speaker Verification to
3
+ Multispeaker Text-To-Speech Synthesis](https://arxiv.org/pdf/1806.04558.pdf) (SV2TTS) with a vocoder that works in real-time. This was my [master's thesis](https://matheo.uliege.be/handle/2268.2/6801).
4
+
5
+ SV2TTS is a deep learning framework in three stages. In the first stage, one creates a digital representation of a voice from a few seconds of audio. In the second and third stages, this representation is used as reference to generate speech given arbitrary text.
6
+
7
+ **Video demonstration** (click the picture):
8
+
9
+ [![Toolbox demo](https://i.imgur.com/8lFUlgz.png)](https://www.youtube.com/watch?v=-O_hYhToKoA)
10
+
11
+
12
+
13
+ ### Papers implemented
14
+ | URL | Designation | Title | Implementation source |
15
+ | --- | ----------- | ----- | --------------------- |
16
+ |[**1806.04558**](https://arxiv.org/pdf/1806.04558.pdf) | **SV2TTS** | **Transfer Learning from Speaker Verification to Multispeaker Text-To-Speech Synthesis** | This repo |
17
+ |[1802.08435](https://arxiv.org/pdf/1802.08435.pdf) | WaveRNN (vocoder) | Efficient Neural Audio Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN) |
18
+ |[1703.10135](https://arxiv.org/pdf/1703.10135.pdf) | Tacotron (synthesizer) | Tacotron: Towards End-to-End Speech Synthesis | [fatchord/WaveRNN](https://github.com/fatchord/WaveRNN)
19
+ |[1710.10467](https://arxiv.org/pdf/1710.10467.pdf) | GE2E (encoder)| Generalized End-To-End Loss for Speaker Verification | This repo |
20
+
21
+ ## News
22
+ **10/01/22**: I recommend checking out [CoquiTTS](https://github.com/coqui-ai/tts). It's a good and up-to-date TTS repository targeted for the ML community. It can also do voice cloning and more, such as cross-language cloning or voice conversion.
23
+
24
+ **28/12/21**: I've done a [major maintenance update](https://github.com/CorentinJ/Real-Time-Voice-Cloning/pull/961). Mostly, I've worked on making setup easier. Find new instructions in the section below.
25
+
26
+ **14/02/21**: This repo now runs on PyTorch instead of Tensorflow, thanks to the help of @bluefish.
27
+
28
+ **13/11/19**: I'm now working full time and I will rarely maintain this repo anymore. To anyone who reads this:
29
+ - **If you just want to clone your voice (and not someone else's):** I recommend our free plan on [Resemble.AI](https://www.resemble.ai/). You will get a better voice quality and less prosody errors.
30
+ - **If this is not your case:** proceed with this repository, but you might end up being disappointed by the results. If you're planning to work on a serious project, my strong advice: find another TTS repo. Go [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/364) for more info.
31
+
32
+ **20/08/19:** I'm working on [resemblyzer](https://github.com/resemble-ai/Resemblyzer), an independent package for the voice encoder (inference only). You can use your trained encoder models from this repo with it.
33
+
34
+
35
+ ## Setup
36
+
37
+ ### 1. Install Requirements
38
+ 1. Both Windows and Linux are supported. A GPU is recommended for training and for inference speed, but is not mandatory.
39
+ 2. Python 3.7 is recommended. Python 3.5 or greater should work, but you'll probably have to tweak the dependencies' versions. I recommend setting up a virtual environment using `venv`, but this is optional.
40
+ 3. Install [ffmpeg](https://ffmpeg.org/download.html#get-packages). This is necessary for reading audio files.
41
+ 4. Install [PyTorch](https://pytorch.org/get-started/locally/). Pick the latest stable version, your operating system, your package manager (pip by default) and finally pick any of the proposed CUDA versions if you have a GPU, otherwise pick CPU. Run the given command.
42
+ 5. Install the remaining requirements with `pip install -r requirements.txt`
43
+
44
+ ### 2. (Optional) Download Pretrained Models
45
+ Pretrained models are now downloaded automatically. If this doesn't work for you, you can manually download them [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Pretrained-models).
46
+
47
+ ### 3. (Optional) Test Configuration
48
+ Before you download any dataset, you can begin by testing your configuration with:
49
+
50
+ `python demo_cli.py`
51
+
52
+ If all tests pass, you're good to go.
53
+
54
+ ### 4. (Optional) Download Datasets
55
+ For playing with the toolbox alone, I only recommend downloading [`LibriSpeech/train-clean-100`](https://www.openslr.org/resources/12/train-clean-100.tar.gz). Extract the contents as `<datasets_root>/LibriSpeech/train-clean-100` where `<datasets_root>` is a directory of your choosing. Other datasets are supported in the toolbox, see [here](https://github.com/CorentinJ/Real-Time-Voice-Cloning/wiki/Training#datasets). You're free not to download any dataset, but then you will need your own data as audio files or you will have to record it with the toolbox.
56
+
57
+ ### 5. Launch the Toolbox
58
+ You can then try the toolbox:
59
+
60
+ `python demo_toolbox.py -d <datasets_root>`
61
+ or
62
+ `python demo_toolbox.py`
63
+
64
+ depending on whether you downloaded any datasets. If you are running an X-server or if you have the error `Aborted (core dumped)`, see [this issue](https://github.com/CorentinJ/Real-Time-Voice-Cloning/issues/11#issuecomment-504733590).
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/__init__.py ADDED
@@ -0,0 +1 @@
 
 
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/audio.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from scipy.ndimage.morphology import binary_dilation
4
+ from .params_data import *
5
+ from pathlib import Path
6
+ from typing import Optional, Union
7
+ import numpy as np
8
+ import webrtcvad
9
+ import librosa
10
+ import struct
11
+
12
+ import torch
13
+ from torchaudio.transforms import Resample
14
+ from librosa.filters import mel as librosa_mel_fn
15
+
16
+
17
+ int16_max = (2 ** 15) - 1
18
+
19
+
20
+ def preprocess_wav(fpath_or_wav: Union[str, Path, np.ndarray],
21
+ source_sr: Optional[int] = None):
22
+ """
23
+ Applies the preprocessing operations used in training the Speaker Encoder to a waveform
24
+ either on disk or in memory. The waveform will be resampled to match the data hyperparameters.
25
+
26
+ :param fpath_or_wav: either a filepath to an audio file (many extensions are supported, not
27
+ just .wav), either the waveform as a numpy array of floats.
28
+ :param source_sr: if passing an audio waveform, the sampling rate of the waveform before
29
+ preprocessing. After preprocessing, the waveform's sampling rate will match the data
30
+ hyperparameters. If passing a filepath, the sampling rate will be automatically detected and
31
+ this argument will be ignored.
32
+ """
33
+ # Load the wav from disk if needed
34
+ if isinstance(fpath_or_wav, str) or isinstance(fpath_or_wav, Path):
35
+ wav, source_sr = librosa.load(fpath_or_wav, sr=None)
36
+ else:
37
+ wav = fpath_or_wav
38
+
39
+ # Resample the wav if needed
40
+ if source_sr is not None and source_sr != sampling_rate:
41
+ wav = librosa.resample(wav, orig_sr=source_sr, target_sr=sampling_rate)
42
+
43
+ # Apply the preprocessing: normalize volume and shorten long silences
44
+ wav = normalize_volume(wav, audio_norm_target_dBFS, increase_only=True)
45
+ wav = trim_long_silences(wav)
46
+
47
+ return wav
48
+
49
+
50
+ def preprocess_wav_batch(wavs, source_sr=22050):
51
+ # This torch version is designed to cope with a batch of same lengths wavs
52
+ if sampling_rate != source_sr:
53
+ resample = Resample(source_sr, sampling_rate)
54
+ wavs = resample(wavs)
55
+ wavs_preprocessed = normalize_volume_batch(wavs, audio_norm_target_dBFS,
56
+ increase_only=True)
57
+ # Trimming silence is not implemented in this version yet!
58
+ return wavs_preprocessed
59
+
60
+
61
+ def wav_to_mel_spectrogram(wav):
62
+ """
63
+ Derives a mel spectrogram ready to be used by the encoder from a preprocessed audio waveform.
64
+ Note: this not a log-mel spectrogram.
65
+ """
66
+ frames = librosa.feature.melspectrogram(
67
+ y=wav,
68
+ sr=sampling_rate,
69
+ n_fft=int(sampling_rate * mel_window_length / 1000),
70
+ hop_length=int(sampling_rate * mel_window_step / 1000),
71
+ n_mels=mel_n_channels
72
+ )
73
+ return frames.astype(np.float32).T
74
+
75
+
76
+ def wav_to_mel_spectrogram_batch(wavs):
77
+ # This torch version is designed to cope with a batch of same lengths wavs
78
+ n_fft = int(sampling_rate * mel_window_length / 1000)
79
+ hop_length = int(sampling_rate * mel_window_step / 1000)
80
+ win_length = int(sampling_rate * mel_window_length / 1000)
81
+ window = torch.hann_window(n_fft).to(wavs)
82
+ mel_basis = torch.from_numpy(librosa_mel_fn(sr=sampling_rate, n_fft=n_fft,
83
+ n_mels=mel_n_channels)).to(wavs)
84
+ s = torch.stft(wavs, n_fft=n_fft, hop_length=hop_length,
85
+ win_length=win_length, window=window, center=True, return_complex=False)
86
+ real_part, imag_part = s.unbind(-1)
87
+ stftm = real_part**2 + imag_part**2
88
+ mels = torch.matmul(mel_basis, stftm)
89
+ return torch.transpose(mels, 1, 2)
90
+
91
+
92
+ def normalize_volume(wav, target_dBFS, increase_only=False, decrease_only=False):
93
+ if increase_only and decrease_only:
94
+ raise ValueError("Both increase only and decrease only are set")
95
+ dBFS_change = target_dBFS - 10 * np.log10(np.mean(wav ** 2))
96
+ if (dBFS_change < 0 and increase_only) or (dBFS_change > 0 and decrease_only):
97
+ return wav
98
+ return wav * (10 ** (dBFS_change / 20))
99
+
100
+
101
+ def normalize_volume_batch(wavs, target_dBFS, increase_only=False, decrease_only=False):
102
+ # This torch version is designed to cope with a batch of same lengths wavs
103
+ if increase_only and decrease_only:
104
+ raise ValueError("Both increase only and decrease only are set")
105
+ dBFS_change = target_dBFS - 10 * torch.log10(torch.mean(wavs ** 2, axis=-1))
106
+ scales = torch.ones(wavs.shape[0], device=wavs.device, dtype=wavs.dtype)
107
+ if increase_only:
108
+ mask = (dBFS_change > 0).to(scales)
109
+ elif decrease_only:
110
+ mask = (dBFS_change < 0).to(scales)
111
+ else:
112
+ mask = torch.zeros_like(scales)
113
+ scales = scales + mask * (10 ** (dBFS_change / 20) - 1.0)
114
+ return wavs * scales.unsqueeze(-1)
115
+
116
+
117
+ def trim_long_silences(wav):
118
+ """
119
+ Ensures that segments without voice in the waveform remain no longer than a
120
+ threshold determined by the VAD parameters in params.py.
121
+
122
+ :param wav: the raw waveform as a numpy array of floats
123
+ :return: the same waveform with silences trimmed away (length <= original wav length)
124
+ """
125
+ # Compute the voice detection window size
126
+ samples_per_window = (vad_window_length * sampling_rate) // 1000
127
+
128
+ # Trim the end of the audio to have a multiple of the window size
129
+ wav = wav[:len(wav) - (len(wav) % samples_per_window)]
130
+
131
+ # Convert the float waveform to 16-bit mono PCM
132
+ pcm_wave = struct.pack("%dh" % len(wav), *(np.round(wav * int16_max)).astype(np.int16))
133
+
134
+ # Perform voice activation detection
135
+ voice_flags = []
136
+ vad = webrtcvad.Vad(mode=3)
137
+ for window_start in range(0, len(wav), samples_per_window):
138
+ window_end = window_start + samples_per_window
139
+ voice_flags.append(vad.is_speech(pcm_wave[window_start * 2:window_end * 2],
140
+ sample_rate=sampling_rate))
141
+ voice_flags = np.array(voice_flags)
142
+
143
+ # Smooth the voice detection with a moving average
144
+ def moving_average(array, width):
145
+ array_padded = np.concatenate((np.zeros((width - 1) // 2), array, np.zeros(width // 2)))
146
+ ret = np.cumsum(array_padded, dtype=float)
147
+ ret[width:] = ret[width:] - ret[:-width]
148
+ return ret[width - 1:] / width
149
+
150
+ audio_mask = moving_average(voice_flags, vad_moving_average_width)
151
+ audio_mask = np.round(audio_mask).astype(np.bool)
152
+
153
+ # Dilate the voiced regions
154
+ audio_mask = binary_dilation(audio_mask, np.ones(vad_max_silence_length + 1))
155
+ audio_mask = np.repeat(audio_mask, samples_per_window)
156
+
157
+ return wav[audio_mask == True]
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/config.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ librispeech_datasets = {
4
+ "train": {
5
+ "clean": ["LibriSpeech/train-clean-100", "LibriSpeech/train-clean-360"],
6
+ "other": ["LibriSpeech/train-other-500"]
7
+ },
8
+ "test": {
9
+ "clean": ["LibriSpeech/test-clean"],
10
+ "other": ["LibriSpeech/test-other"]
11
+ },
12
+ "dev": {
13
+ "clean": ["LibriSpeech/dev-clean"],
14
+ "other": ["LibriSpeech/dev-other"]
15
+ },
16
+ }
17
+ libritts_datasets = {
18
+ "train": {
19
+ "clean": ["LibriTTS/train-clean-100", "LibriTTS/train-clean-360"],
20
+ "other": ["LibriTTS/train-other-500"]
21
+ },
22
+ "test": {
23
+ "clean": ["LibriTTS/test-clean"],
24
+ "other": ["LibriTTS/test-other"]
25
+ },
26
+ "dev": {
27
+ "clean": ["LibriTTS/dev-clean"],
28
+ "other": ["LibriTTS/dev-other"]
29
+ },
30
+ }
31
+ voxceleb_datasets = {
32
+ "voxceleb1" : {
33
+ "train": ["VoxCeleb1/wav"],
34
+ "test": ["VoxCeleb1/test_wav"]
35
+ },
36
+ "voxceleb2" : {
37
+ "train": ["VoxCeleb2/dev/aac"],
38
+ "test": ["VoxCeleb2/test_wav"]
39
+ }
40
+ }
41
+
42
+ other_datasets = [
43
+ "LJSpeech-1.1",
44
+ "VCTK-Corpus/wav48",
45
+ ]
46
+
47
+ anglophone_nationalites = ["australia", "canada", "ireland", "uk", "usa"]
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from .speaker_verification_dataset import SpeakerVerificationDataset
4
+ from .speaker_verification_dataset import SpeakerVerificationDataLoader
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/random_cycler.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ import random
4
+
5
+ class RandomCycler:
6
+ """
7
+ Creates an internal copy of a sequence and allows access to its items in a constrained random
8
+ order. For a source sequence of n items and one or several consecutive queries of a total
9
+ of m items, the following guarantees hold (one implies the other):
10
+ - Each item will be returned between m // n and ((m - 1) // n) + 1 times.
11
+ - Between two appearances of the same item, there may be at most 2 * (n - 1) other items.
12
+ """
13
+
14
+ def __init__(self, source):
15
+ if len(source) == 0:
16
+ raise Exception("Can't create RandomCycler from an empty collection")
17
+ self.all_items = list(source)
18
+ self.next_items = []
19
+
20
+ def sample(self, count: int):
21
+ shuffle = lambda l: random.sample(l, len(l))
22
+
23
+ out = []
24
+ while count > 0:
25
+ if count >= len(self.all_items):
26
+ out.extend(shuffle(list(self.all_items)))
27
+ count -= len(self.all_items)
28
+ continue
29
+ n = min(count, len(self.next_items))
30
+ out.extend(self.next_items[:n])
31
+ count -= n
32
+ self.next_items = self.next_items[n:]
33
+ if len(self.next_items) == 0:
34
+ self.next_items = shuffle(list(self.all_items))
35
+ return out
36
+
37
+ def __next__(self):
38
+ return self.sample(1)[0]
39
+
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker.py ADDED
@@ -0,0 +1,42 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ from .random_cycler import RandomCycler
+ from .utterance import Utterance
+ from pathlib import Path
+
+ # Contains the set of utterances of a single speaker
+ class Speaker:
+     def __init__(self, root: Path):
+         self.root = root
+         self.name = root.name
+         self.utterances = None
+         self.utterance_cycler = None
+
+     def _load_utterances(self):
+         with self.root.joinpath("_sources.txt").open("r") as sources_file:
+             sources = [l.split(",") for l in sources_file]
+         sources = {frames_fname: wave_fpath for frames_fname, wave_fpath in sources}
+         self.utterances = [Utterance(self.root.joinpath(f), w) for f, w in sources.items()]
+         self.utterance_cycler = RandomCycler(self.utterances)
+
+     def random_partial(self, count, n_frames):
+         """
+         Samples a batch of <count> unique partial utterances from the disk in a way that all
+         utterances come up at least once every two cycles and in a random order every time.
+
+         :param count: The number of partial utterances to sample from the set of utterances from
+         that speaker. Utterances are guaranteed not to be repeated if <count> is not larger than
+         the number of utterances available.
+         :param n_frames: The number of frames in the partial utterance.
+         :return: A list of tuples (utterance, frames, range) where utterance is an Utterance,
+         frames are the frames of the partial utterances and range is the range of the partial
+         utterance with regard to the complete utterance.
+         """
+         if self.utterances is None:
+             self._load_utterances()
+
+         utterances = self.utterance_cycler.sample(count)
+
+         a = [(u,) + u.random_partial(n_frames) for u in utterances]
+
+         return a
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_batch.py ADDED
@@ -0,0 +1,14 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ import numpy as np
+ from typing import List
+ from .speaker import Speaker
+
+ class SpeakerBatch:
+     def __init__(self, speakers: List[Speaker], utterances_per_speaker: int, n_frames: int):
+         self.speakers = speakers
+         self.partials = {s: s.random_partial(utterances_per_speaker, n_frames) for s in speakers}
+
+         # Array of shape (n_speakers * n_utterances, n_frames, mel_n), e.g. for 3 speakers with
+         # 4 utterances each of 160 frames of 40 mel coefficients: (12, 160, 40)
+         self.data = np.array([frames for s in speakers for _, frames, _ in self.partials[s]])
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/speaker_verification_dataset.py ADDED
@@ -0,0 +1,58 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ from .random_cycler import RandomCycler
+ from .speaker_batch import SpeakerBatch
+ from .speaker import Speaker
+ from ..params_data import partials_n_frames
+ from torch.utils.data import Dataset, DataLoader
+ from pathlib import Path
+
+ # TODO: improve with a pool of speakers for data efficiency
+
+ class SpeakerVerificationDataset(Dataset):
+     def __init__(self, datasets_root: Path):
+         self.root = datasets_root
+         speaker_dirs = [f for f in self.root.glob("*") if f.is_dir()]
+         if len(speaker_dirs) == 0:
+             raise Exception("No speakers found. Make sure you are pointing to the directory "
+                             "containing all preprocessed speaker directories.")
+         self.speakers = [Speaker(speaker_dir) for speaker_dir in speaker_dirs]
+         self.speaker_cycler = RandomCycler(self.speakers)
+
+     def __len__(self):
+         return int(1e10)
+
+     def __getitem__(self, index):
+         return next(self.speaker_cycler)
+
+     def get_logs(self):
+         log_string = ""
+         for log_fpath in self.root.glob("*.txt"):
+             with log_fpath.open("r") as log_file:
+                 log_string += "".join(log_file.readlines())
+         return log_string
+
+
+ class SpeakerVerificationDataLoader(DataLoader):
+     def __init__(self, dataset, speakers_per_batch, utterances_per_speaker, sampler=None,
+                  batch_sampler=None, num_workers=0, pin_memory=False, timeout=0,
+                  worker_init_fn=None):
+         self.utterances_per_speaker = utterances_per_speaker
+
+         super().__init__(
+             dataset=dataset,
+             batch_size=speakers_per_batch,
+             shuffle=False,
+             sampler=sampler,
+             batch_sampler=batch_sampler,
+             num_workers=num_workers,
+             collate_fn=self.collate,
+             pin_memory=pin_memory,
+             drop_last=False,
+             timeout=timeout,
+             worker_init_fn=worker_init_fn
+         )
+
+     def collate(self, speakers):
+         return SpeakerBatch(speakers, self.utterances_per_speaker, partials_n_frames)
+
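Note: for orientation, the dataset and loader above are meant to be wired together roughly as follows. This is a sketch only; the data directory is a placeholder for a folder of speakers produced by preprocess.py, the batch sizes mirror params_model.py, and the import path is hypothetical.

    from pathlib import Path
    # hypothetical import path; adjust to your local package layout
    from encoder.data_objects import SpeakerVerificationDataset, SpeakerVerificationDataLoader

    dataset = SpeakerVerificationDataset(Path("datasets/SV2TTS/encoder"))   # placeholder path
    loader = SpeakerVerificationDataLoader(dataset, speakers_per_batch=64,
                                           utterances_per_speaker=10, num_workers=4)
    speaker_batch = next(iter(loader))
    # (64 * 10, 160, 40): utterances x partial frames x mel channels
    print(speaker_batch.data.shape)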
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/data_objects/utterance.py ADDED
@@ -0,0 +1,28 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ import numpy as np
+
+
+ class Utterance:
+     def __init__(self, frames_fpath, wave_fpath):
+         self.frames_fpath = frames_fpath
+         self.wave_fpath = wave_fpath
+
+     def get_frames(self):
+         return np.load(self.frames_fpath)
+
+     def random_partial(self, n_frames):
+         """
+         Crops the frames into a partial utterance of n_frames
+
+         :param n_frames: The number of frames of the partial utterance
+         :return: the partial utterance frames and a tuple indicating the start and end of the
+         partial utterance in the complete utterance.
+         """
+         frames = self.get_frames()
+         if frames.shape[0] == n_frames:
+             start = 0
+         else:
+             start = np.random.randint(0, frames.shape[0] - n_frames)
+         end = start + n_frames
+         return frames[start:end], (start, end)
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/inference.py ADDED
@@ -0,0 +1,211 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from .params_data import *
4
+ from .model import SpeakerEncoder
5
+ from .audio import preprocess_wav, preprocess_wav_batch, wav_to_mel_spectrogram_batch, wav_to_mel_spectrogram
6
+ from matplotlib import cm
7
+ from pathlib import Path
8
+ import matplotlib.pyplot as plt
9
+ import numpy as np
10
+ import torch
11
+
12
+ _model = None # type: SpeakerEncoder
13
+ _device = None # type: torch.device
14
+
15
+
16
+ def load_model(weights_fpath: Path, device="cpu"):
17
+ """
18
+ Loads the model in memory. If this function is not explicitly called, it will be run on the
19
+ first call to embed_frames() with the default weights file.
20
+
21
+ :param weights_fpath: the path to saved model weights.
22
+ :param device: either a torch device or the name of a torch device (e.g. "cpu", "cuda"). The
23
+ model will be loaded and will run on this device. Outputs will however always be on the cpu.
24
+ If None, will default to your GPU if it's available, otherwise your CPU.
25
+ """
26
+ # TODO: I think the slow loading of the encoder might have something to do with the device it
27
+ # was saved on. Worth investigating.
28
+ global _model, _device
29
+ if device is None:
30
+ _device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
31
+ elif isinstance(device, str):
32
+ _device = torch.device(device)
33
+ _model = SpeakerEncoder(_device, torch.device("cpu"))
34
+ checkpoint = torch.load(weights_fpath, map_location="cpu")
35
+ _model.load_state_dict(checkpoint["model_state"])
36
+ _model.eval()
37
+ _model = _model.to(device)
38
+ print("Loaded encoder \"%s\" trained to step %d" % (weights_fpath.name, checkpoint["step"]))
39
+
40
+
41
+ def is_loaded():
42
+ return _model is not None
43
+
44
+
45
+ @torch.no_grad()
46
+ def embed_frames_batch(frames, use_torch=False):
47
+ if _model is None:
48
+ raise Exception("Model was not loaded. Call load_model() before inference.")
49
+
50
+ if not use_torch:
51
+ frames = torch.from_numpy(frames)
52
+ frames = frames.to(_device)
53
+
54
+ embeds = _model.forward(frames)
55
+ if not use_torch:
56
+ embeds = embeds.detach().cpu().numpy()
57
+ return embeds
58
+
59
+
60
+ def compute_partial_slices(n_samples, partial_utterance_n_frames=partials_n_frames,
61
+ min_pad_coverage=0.75, overlap=0.5):
62
+ """
63
+ Computes where to split an utterance waveform and its corresponding mel spectrogram to obtain
64
+ partial utterances of <partial_utterance_n_frames> each. Both the waveform and the mel
65
+ spectrogram slices are returned, so as to make each partial utterance waveform correspond to
66
+ its spectrogram. This function assumes that the mel spectrogram parameters used are those
67
+ defined in params_data.py.
68
+
69
+ The returned ranges may be indexing further than the length of the waveform. It is
70
+ recommended that you pad the waveform with zeros up to wave_slices[-1].stop.
71
+
72
+ :param n_samples: the number of samples in the waveform
73
+ :param partial_utterance_n_frames: the number of mel spectrogram frames in each partial
74
+ utterance
75
+ :param min_pad_coverage: when reaching the last partial utterance, it may or may not have
76
+ enough frames. If at least <min_pad_coverage> of <partial_utterance_n_frames> are present,
77
+ then the last partial utterance will be considered, as if we padded the audio. Otherwise,
78
+ it will be discarded, as if we trimmed the audio. If there aren't enough frames for 1 partial
79
+ utterance, this parameter is ignored so that the function always returns at least 1 slice.
80
+ :param overlap: by how much the partial utterance should overlap. If set to 0, the partial
81
+ utterances are entirely disjoint.
82
+ :return: the waveform slices and mel spectrogram slices as lists of array slices. Index
83
+ respectively the waveform and the mel spectrogram with these slices to obtain the partial
84
+ utterances.
85
+ """
86
+ assert 0 <= overlap < 1
87
+ assert 0 < min_pad_coverage <= 1
88
+
89
+ samples_per_frame = int((sampling_rate * mel_window_step / 1000))
90
+ n_frames = int(np.ceil((n_samples + 1) / samples_per_frame))
91
+ frame_step = max(int(np.round(partial_utterance_n_frames * (1 - overlap))), 1)
92
+
93
+ # Compute the slices
94
+ wav_slices, mel_slices = [], []
95
+ steps = max(1, n_frames - partial_utterance_n_frames + frame_step + 1)
96
+ for i in range(0, steps, frame_step):
97
+ mel_range = np.array([i, i + partial_utterance_n_frames])
98
+ wav_range = mel_range * samples_per_frame
99
+ mel_slices.append(slice(*mel_range))
100
+ wav_slices.append(slice(*wav_range))
101
+
102
+ # Evaluate whether extra padding is warranted or not
103
+ last_wav_range = wav_slices[-1]
104
+ coverage = (n_samples - last_wav_range.start) / (last_wav_range.stop - last_wav_range.start)
105
+ if coverage < min_pad_coverage and len(mel_slices) > 1:
106
+ mel_slices = mel_slices[:-1]
107
+ wav_slices = wav_slices[:-1]
108
+
109
+ return wav_slices, mel_slices
110
+
111
+
112
+ @torch.no_grad()
113
+ def embed_utterance(wav, using_partials=True, return_partials=False, **kwargs):
114
+ """
115
+ Computes an embedding for a single utterance.
116
+
117
+ # TODO: handle multiple wavs to benefit from batching on GPU
118
+ :param wav: a preprocessed (see audio.py) utterance waveform as a numpy array of float32
119
+ :param using_partials: if True, then the utterance is split in partial utterances of
120
+ <partial_utterance_n_frames> frames and the utterance embedding is computed from their
121
+ normalized average. If False, the utterance is instead computed from feeding the entire
122
+ spectrogram to the network.
123
+ :param return_partials: if True, the partial embeddings will also be returned along with the
124
+ wav slices that correspond to the partial embeddings.
125
+ :param kwargs: additional arguments to compute_partial_splits()
126
+ :return: the embedding as a numpy array of float32 of shape (model_embedding_size,). If
127
+ <return_partials> is True, the partial utterances as a numpy array of float32 of shape
128
+ (n_partials, model_embedding_size) and the wav partials as a list of slices will also be
129
+ returned. If <using_partials> is simultaneously set to False, both these values will be None
130
+ instead.
131
+ """
132
+ # Process the entire utterance if not using partials
133
+ if not using_partials:
134
+ frames = wav_to_mel_spectrogram(wav)
135
+ embed = embed_frames_batch(frames[None, ...])[0]
136
+ if return_partials:
137
+ return embed, None, None
138
+ return embed
139
+
140
+ # Compute where to split the utterance into partials and pad if necessary
141
+ wave_slices, mel_slices = compute_partial_slices(len(wav), **kwargs)
142
+ max_wave_length = wave_slices[-1].stop
143
+ if max_wave_length >= len(wav):
144
+ wav = np.pad(wav, (0, max_wave_length - len(wav)), "constant")
145
+
146
+ # Split the utterance into partials
147
+ frames = wav_to_mel_spectrogram(wav)
148
+ frames_batch = np.array([frames[s] for s in mel_slices])
149
+ partial_embeds = embed_frames_batch(frames_batch)
150
+
151
+ # Compute the utterance embedding from the partial embeddings
152
+ raw_embed = np.mean(partial_embeds, axis=0)
153
+ embed = raw_embed / np.linalg.norm(raw_embed, 2)
154
+
155
+ if return_partials:
156
+ return embed, partial_embeds, wave_slices
157
+ return embed
158
+
159
+
160
+ @torch.no_grad()
161
+ def embed_utterance_batch(wavs, using_partials=True, return_partials=False, **kwargs):
162
+ # This torch version is designed to cope with a batch of same lengths wavs
163
+ if not using_partials:
164
+ frames = wav_to_mel_spectrogram_batch(wavs)
165
+ embeds = embed_frames_batch(frames)
166
+ if return_partials:
167
+ return embeds, None, None
168
+ return embeds
169
+
170
+ wave_slices, mel_slices = compute_partial_slices(wavs.shape[-1], **kwargs)
171
+ max_wave_length = wave_slices[-1].stop
172
+ if max_wave_length >= wavs.shape[-1]:
173
+ wavs = torch.cat([wavs, torch.ones((wavs.shape[0], max_wave_length - wavs.shape[-1]),
174
+ dtype=wavs.dtype, device=wavs.device)], 1)
175
+
176
+ frames = wav_to_mel_spectrogram_batch(wavs)
177
+ frames_batch = []
178
+ for i in range(len(frames)):
179
+ frames_batch += [frames[i][s] for s in mel_slices]
180
+ frames_batch = torch.stack(frames_batch, 0)
181
+ partial_embeds = embed_frames_batch(frames_batch, use_torch=True)
182
+ partial_embeds = partial_embeds.view(wavs.shape[0], len(mel_slices), -1)
183
+
184
+ raw_embeds = torch.mean(partial_embeds, axis=1, keepdims=False)
185
+ embeds = raw_embeds / torch.linalg.norm(raw_embeds, axis=-1, keepdims=True)
186
+
187
+ if return_partials:
188
+ return embeds, partial_embeds, wave_slices
189
+ return embeds
190
+
191
+
192
+ def embed_speaker(wavs, **kwargs):
193
+ raise NotImplementedError()
194
+
195
+
196
+ def plot_embedding_as_heatmap(embed, ax=None, title="", shape=None, color_range=(0, 0.30)):
197
+ if ax is None:
198
+ ax = plt.gca()
199
+
200
+ if shape is None:
201
+ height = int(np.sqrt(len(embed)))
202
+ shape = (height, -1)
203
+ embed = embed.reshape(shape)
204
+
205
+ cmap = cm.get_cmap()
206
+ mappable = ax.imshow(embed, cmap=cmap)
207
+ cbar = plt.colorbar(mappable, ax=ax, fraction=0.046, pad=0.04)
208
+ cbar.set_clim(*color_range)
209
+
210
+ ax.set_xticks([]), ax.set_yticks([])
211
+ ax.set_title(title)
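Note: typical single-utterance use of this inference module looks roughly like the following. This is a sketch; the checkpoint and wav paths are placeholders and the import path is hypothetical.

    from pathlib import Path
    import numpy as np
    # hypothetical import path; adjust to your local package layout
    from encoder import inference as encoder

    encoder.load_model(Path("saved_models/encoder.pt"), device="cpu")   # placeholder checkpoint
    wav = encoder.preprocess_wav("reference.wav")                       # resample, normalize, trim silence
    embed = encoder.embed_utterance(wav)                                # shape (256,), L2-normalized
    print(embed.shape, np.linalg.norm(embed))                           # norm should be ~1.0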
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/model.py ADDED
@@ -0,0 +1,137 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from .params_model import *
4
+ from .params_data import *
5
+ from scipy.interpolate import interp1d
6
+ from sklearn.metrics import roc_curve
7
+ from torch.nn.utils import clip_grad_norm_
8
+ from scipy.optimize import brentq
9
+ from torch import nn
10
+ import numpy as np
11
+ import torch
12
+
13
+
14
+ class SpeakerEncoder(nn.Module):
15
+ def __init__(self, device, loss_device):
16
+ super().__init__()
17
+ self.loss_device = loss_device
18
+
19
+ # Network definition
20
+ self.lstm = nn.LSTM(input_size=mel_n_channels,
21
+ hidden_size=model_hidden_size,
22
+ num_layers=model_num_layers,
23
+ batch_first=True).to(device)
24
+ self.linear = nn.Linear(in_features=model_hidden_size,
25
+ out_features=model_embedding_size).to(device)
26
+ self.relu = torch.nn.ReLU().to(device)
27
+
28
+ # Cosine similarity scaling (with fixed initial parameter values)
29
+ self.similarity_weight = nn.Parameter(torch.tensor([10.])).to(loss_device)
30
+ self.similarity_bias = nn.Parameter(torch.tensor([-5.])).to(loss_device)
31
+
32
+ # Loss
33
+ self.loss_fn = nn.CrossEntropyLoss().to(loss_device)
34
+
35
+ def do_gradient_ops(self):
36
+ # Gradient scale
37
+ self.similarity_weight.grad *= 0.01
38
+ self.similarity_bias.grad *= 0.01
39
+
40
+ # Gradient clipping
41
+ clip_grad_norm_(self.parameters(), 3, norm_type=2)
42
+
43
+ def forward(self, utterances, hidden_init=None):
44
+ """
45
+ Computes the embeddings of a batch of utterance spectrograms.
46
+
47
+ :param utterances: batch of mel-scale filterbanks of same duration as a tensor of shape
48
+ (batch_size, n_frames, n_channels)
49
+ :param hidden_init: initial hidden state of the LSTM as a tensor of shape (num_layers,
50
+ batch_size, hidden_size). Will default to a tensor of zeros if None.
51
+ :return: the embeddings as a tensor of shape (batch_size, embedding_size)
52
+ """
53
+ # Pass the input through the LSTM layers and retrieve all outputs, the final hidden state
54
+ # and the final cell state.
55
+ out, (hidden, cell) = self.lstm(utterances, hidden_init)
56
+
57
+ # We take only the hidden state of the last layer
58
+ embeds_raw = self.relu(self.linear(hidden[-1]))
59
+
60
+ # L2-normalize it
61
+ embeds = embeds_raw / torch.norm(embeds_raw, dim=1, keepdim=True)
62
+
63
+ return embeds
64
+
65
+ def similarity_matrix(self, embeds):
66
+ """
67
+ Computes the similarity matrix according the section 2.1 of GE2E.
68
+
69
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
70
+ utterances_per_speaker, embedding_size)
71
+ :return: the similarity matrix as a tensor of shape (speakers_per_batch,
72
+ utterances_per_speaker, speakers_per_batch)
73
+ """
74
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
75
+
76
+ # Inclusive centroids (1 per speaker). Cloning is needed for reverse differentiation
77
+ centroids_incl = torch.mean(embeds, dim=1, keepdim=True)
78
+ centroids_incl = centroids_incl.clone() / torch.norm(centroids_incl, dim=2, keepdim=True)
79
+
80
+ # Exclusive centroids (1 per utterance)
81
+ centroids_excl = (torch.sum(embeds, dim=1, keepdim=True) - embeds)
82
+ centroids_excl /= (utterances_per_speaker - 1)
83
+ centroids_excl = centroids_excl.clone() / torch.norm(centroids_excl, dim=2, keepdim=True)
84
+
85
+ # Similarity matrix. The cosine similarity of already 2-normed vectors is simply the dot
86
+ # product of these vectors (which is just an element-wise multiplication reduced by a sum).
87
+ # We vectorize the computation for efficiency.
88
+ sim_matrix = torch.zeros(speakers_per_batch, utterances_per_speaker,
89
+ speakers_per_batch).to(self.loss_device)
90
+ mask_matrix = 1 - np.eye(speakers_per_batch, dtype=int)  # np.int was removed in NumPy 1.24+
91
+ for j in range(speakers_per_batch):
92
+ mask = np.where(mask_matrix[j])[0]
93
+ sim_matrix[mask, :, j] = (embeds[mask] * centroids_incl[j]).sum(dim=2)
94
+ sim_matrix[j, :, j] = (embeds[j] * centroids_excl[j]).sum(dim=1)
95
+
96
+ ## Even more vectorized version (slower maybe because of transpose)
97
+ # sim_matrix2 = torch.zeros(speakers_per_batch, speakers_per_batch, utterances_per_speaker
98
+ # ).to(self.loss_device)
99
+ # eye = np.eye(speakers_per_batch, dtype=np.int)
100
+ # mask = np.where(1 - eye)
101
+ # sim_matrix2[mask] = (embeds[mask[0]] * centroids_incl[mask[1]]).sum(dim=2)
102
+ # mask = np.where(eye)
103
+ # sim_matrix2[mask] = (embeds * centroids_excl).sum(dim=2)
104
+ # sim_matrix2 = sim_matrix2.transpose(1, 2)
105
+
106
+ sim_matrix = sim_matrix * self.similarity_weight + self.similarity_bias
107
+ return sim_matrix
108
+
109
+ def loss(self, embeds):
110
+ """
111
+ Computes the softmax loss according the section 2.1 of GE2E.
112
+
113
+ :param embeds: the embeddings as a tensor of shape (speakers_per_batch,
114
+ utterances_per_speaker, embedding_size)
115
+ :return: the loss and the EER for this batch of embeddings.
116
+ """
117
+ speakers_per_batch, utterances_per_speaker = embeds.shape[:2]
118
+
119
+ # Loss
120
+ sim_matrix = self.similarity_matrix(embeds)
121
+ sim_matrix = sim_matrix.reshape((speakers_per_batch * utterances_per_speaker,
122
+ speakers_per_batch))
123
+ ground_truth = np.repeat(np.arange(speakers_per_batch), utterances_per_speaker)
124
+ target = torch.from_numpy(ground_truth).long().to(self.loss_device)
125
+ loss = self.loss_fn(sim_matrix, target)
126
+
127
+ # EER (not backpropagated)
128
+ with torch.no_grad():
129
+ inv_argmax = lambda i: np.eye(1, speakers_per_batch, i, dtype=int)[0]
130
+ labels = np.array([inv_argmax(i) for i in ground_truth])
131
+ preds = sim_matrix.detach().cpu().numpy()
132
+
133
+ # Snippet from https://yangcha.github.io/EER-ROC/
134
+ fpr, tpr, thresholds = roc_curve(labels.flatten(), preds.flatten())
135
+ eer = brentq(lambda x: 1. - x - interp1d(fpr, tpr)(x), 0., 1.)
136
+
137
+ return loss, eer
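Note: the GE2E loss above can be exercised on random, normalized embeddings as a shape check. A minimal sketch, assuming the hypothetical import path below matches your layout.

    import torch
    # hypothetical import path; adjust to your local package layout
    from encoder.model import SpeakerEncoder

    device = torch.device("cpu")
    model = SpeakerEncoder(device, loss_device=device)

    embeds = torch.randn(4, 5, 256)                         # 4 speakers x 5 utterances x 256 dims
    embeds = embeds / embeds.norm(dim=-1, keepdim=True)     # L2-normalize, as forward() would

    sim = model.similarity_matrix(embeds)                   # (4, 5, 4): utterance vs. speaker centroid
    loss, eer = model.loss(embeds)
    print(sim.shape, float(loss), eer)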
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_data.py ADDED
@@ -0,0 +1,30 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ ## Mel-filterbank
+ mel_window_length = 25  # In milliseconds
+ mel_window_step = 10    # In milliseconds
+ mel_n_channels = 40
+
+
+ ## Audio
+ sampling_rate = 16000
+ # Number of spectrogram frames in a partial utterance
+ partials_n_frames = 160     # 1600 ms
+ # Number of spectrogram frames at inference
+ inference_n_frames = 80     # 800 ms
+
+
+ ## Voice Activation Detection
+ # Window size of the VAD. Must be either 10, 20 or 30 milliseconds.
+ # This sets the granularity of the VAD. Should not need to be changed.
+ vad_window_length = 30  # In milliseconds
+ # Number of frames to average together when performing the moving average smoothing.
+ # The larger this value, the larger the VAD variations must be to not get smoothed out.
+ vad_moving_average_width = 8
+ # Maximum number of consecutive silent frames a segment can have.
+ vad_max_silence_length = 6
+
+
+ ## Audio volume normalization
+ audio_norm_target_dBFS = -30
+
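Note: these settings imply the frame arithmetic used by the rest of the encoder; a quick consistency check (not part of the file above):

    sampling_rate = 16000
    mel_window_step = 10                                            # ms per hop
    partials_n_frames = 160
    samples_per_frame = sampling_rate * mel_window_step // 1000     # 160 samples per 10 ms hop
    partial_duration = partials_n_frames * mel_window_step / 1000   # 160 frames -> 1.6 s
    assert samples_per_frame == 160 and partial_duration == 1.6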
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/params_model.py ADDED
@@ -0,0 +1,12 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
+
+ ## Model parameters
+ model_hidden_size = 256
+ model_embedding_size = 256
+ model_num_layers = 3
+
+
+ ## Training parameters
+ learning_rate_init = 1e-4
+ speakers_per_batch = 64
+ utterances_per_speaker = 10
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/preprocess.py ADDED
@@ -0,0 +1,177 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from multiprocess.pool import ThreadPool
4
+ from .params_data import *
5
+ from .config import librispeech_datasets, anglophone_nationalites
6
+ from datetime import datetime
7
+ from .audio import preprocess_wav, wav_to_mel_spectrogram, preprocess_wav_batch, wav_to_mel_spectrogram_batch
8
+ from pathlib import Path
9
+ from tqdm import tqdm
10
+ import numpy as np
11
+
12
+
13
+ class DatasetLog:
14
+ """
15
+ Registers metadata about the dataset in a text file.
16
+ """
17
+ def __init__(self, root, name):
18
+ self.text_file = open(Path(root, "Log_%s.txt" % name.replace("/", "_")), "w")
19
+ self.sample_data = dict()
20
+
21
+ start_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
22
+ self.write_line("Creating dataset %s on %s" % (name, start_time))
23
+ self.write_line("-----")
24
+ self._log_params()
25
+
26
+ def _log_params(self):
27
+ from encoder import params_data
28
+ self.write_line("Parameter values:")
29
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
30
+ value = getattr(params_data, param_name)
31
+ self.write_line("\t%s: %s" % (param_name, value))
32
+ self.write_line("-----")
33
+
34
+ def write_line(self, line):
35
+ self.text_file.write("%s\n" % line)
36
+
37
+ def add_sample(self, **kwargs):
38
+ for param_name, value in kwargs.items():
39
+ if not param_name in self.sample_data:
40
+ self.sample_data[param_name] = []
41
+ self.sample_data[param_name].append(value)
42
+
43
+ def finalize(self):
44
+ self.write_line("Statistics:")
45
+ for param_name, values in self.sample_data.items():
46
+ self.write_line("\t%s:" % param_name)
47
+ self.write_line("\t\tmin %.3f, max %.3f" % (np.min(values), np.max(values)))
48
+ self.write_line("\t\tmean %.3f, median %.3f" % (np.mean(values), np.median(values)))
49
+ self.write_line("-----")
50
+ end_time = str(datetime.now().strftime("%A %d %B %Y at %H:%M"))
51
+ self.write_line("Finished on %s" % end_time)
52
+ self.text_file.close()
53
+
54
+
55
+ def _init_preprocess_dataset(dataset_name, datasets_root, out_dir) -> (Path, DatasetLog):
56
+ dataset_root = datasets_root.joinpath(dataset_name)
57
+ if not dataset_root.exists():
58
+ print("Couldn\'t find %s, skipping this dataset." % dataset_root)
59
+ return None, None
60
+ return dataset_root, DatasetLog(out_dir, dataset_name)
61
+
62
+
63
+ def _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, extension,
64
+ skip_existing, logger):
65
+ print("%s: Preprocessing data for %d speakers." % (dataset_name, len(speaker_dirs)))
66
+
67
+ # Function to preprocess utterances for one speaker
68
+ def preprocess_speaker(speaker_dir: Path):
69
+ # Give a name to the speaker that includes its dataset
70
+ speaker_name = "_".join(speaker_dir.relative_to(datasets_root).parts)
71
+
72
+ # Create an output directory with that name, as well as a txt file containing a
73
+ # reference to each source file.
74
+ speaker_out_dir = out_dir.joinpath(speaker_name)
75
+ speaker_out_dir.mkdir(exist_ok=True)
76
+ sources_fpath = speaker_out_dir.joinpath("_sources.txt")
77
+
78
+ # There's a possibility that the preprocessing was interrupted earlier, check if
79
+ # there already is a sources file.
80
+ if sources_fpath.exists():
81
+ try:
82
+ with sources_fpath.open("r") as sources_file:
83
+ existing_fnames = {line.split(",")[0] for line in sources_file}
84
+ except:
85
+ existing_fnames = {}
86
+ else:
87
+ existing_fnames = {}
88
+
89
+ # Gather all audio files for that speaker recursively
90
+ sources_file = sources_fpath.open("a" if skip_existing else "w")
91
+ for in_fpath in speaker_dir.glob("**/*.%s" % extension):
92
+ # Check if the target output file already exists
93
+ out_fname = "_".join(in_fpath.relative_to(speaker_dir).parts)
94
+ out_fname = out_fname.replace(".%s" % extension, ".npy")
95
+ if skip_existing and out_fname in existing_fnames:
96
+ continue
97
+
98
+ # Load and preprocess the waveform
99
+ wav = preprocess_wav(in_fpath)
100
+ if len(wav) == 0:
101
+ continue
102
+
103
+ # Create the mel spectrogram, discard those that are too short
104
+ frames = wav_to_mel_spectrogram(wav)
105
+ if len(frames) < partials_n_frames:
106
+ continue
107
+
108
+ out_fpath = speaker_out_dir.joinpath(out_fname)
109
+ np.save(out_fpath, frames)
110
+ logger.add_sample(duration=len(wav) / sampling_rate)
111
+ sources_file.write("%s,%s\n" % (out_fname, in_fpath))
112
+
113
+ sources_file.close()
114
+
115
+ # Process the utterances for each speaker
116
+ with ThreadPool(8) as pool:
117
+ list(tqdm(pool.imap(preprocess_speaker, speaker_dirs), dataset_name, len(speaker_dirs),
118
+ unit="speakers"))
119
+ logger.finalize()
120
+ print("Done preprocessing %s.\n" % dataset_name)
121
+
122
+
123
+ def preprocess_librispeech(datasets_root: Path, out_dir: Path, skip_existing=False):
124
+ for dataset_name in librispeech_datasets["train"]["other"]:
125
+ # Initialize the preprocessing
126
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
127
+ if not dataset_root:
128
+ return
129
+
130
+ # Preprocess all speakers
131
+ speaker_dirs = list(dataset_root.glob("*"))
132
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "flac",
133
+ skip_existing, logger)
134
+
135
+
136
+ def preprocess_voxceleb1(datasets_root: Path, out_dir: Path, skip_existing=False):
137
+ # Initialize the preprocessing
138
+ dataset_name = "VoxCeleb1"
139
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
140
+ if not dataset_root:
141
+ return
142
+
143
+ # Get the contents of the meta file
144
+ with dataset_root.joinpath("vox1_meta.csv").open("r") as metafile:
145
+ metadata = [line.split("\t") for line in metafile][1:]
146
+
147
+ # Select the ID and the nationality, filter out non-anglophone speakers
148
+ nationalities = {line[0]: line[3] for line in metadata}
149
+ keep_speaker_ids = [speaker_id for speaker_id, nationality in nationalities.items() if
150
+ nationality.lower() in anglophone_nationalites]
151
+ print("VoxCeleb1: using samples from %d (presumed anglophone) speakers out of %d." %
152
+ (len(keep_speaker_ids), len(nationalities)))
153
+
154
+ # Get the speaker directories for anglophone speakers only
155
+ speaker_dirs = dataset_root.joinpath("wav").glob("*")
156
+ speaker_dirs = [speaker_dir for speaker_dir in speaker_dirs if
157
+ speaker_dir.name in keep_speaker_ids]
158
+ print("VoxCeleb1: found %d anglophone speakers on the disk, %d missing (this is normal)." %
159
+ (len(speaker_dirs), len(keep_speaker_ids) - len(speaker_dirs)))
160
+
161
+ # Preprocess all speakers
162
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "wav",
163
+ skip_existing, logger)
164
+
165
+
166
+ def preprocess_voxceleb2(datasets_root: Path, out_dir: Path, skip_existing=False):
167
+ # Initialize the preprocessing
168
+ dataset_name = "VoxCeleb2"
169
+ dataset_root, logger = _init_preprocess_dataset(dataset_name, datasets_root, out_dir)
170
+ if not dataset_root:
171
+ return
172
+
173
+ # Get the speaker directories
174
+ # Preprocess all speakers
175
+ speaker_dirs = list(dataset_root.joinpath("dev", "aac").glob("*"))
176
+ _preprocess_speaker_dirs(speaker_dirs, dataset_name, datasets_root, out_dir, "m4a",
177
+ skip_existing, logger)
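Note: a preprocessing run is typically kicked off like this. A sketch only; the datasets root must contain the LibriSpeech folders named in config.py, the output directory is a placeholder, and the import path is hypothetical.

    from pathlib import Path
    # hypothetical import path; adjust to your local package layout
    from encoder.preprocess import preprocess_librispeech

    datasets_root = Path("datasets")             # expects e.g. datasets/LibriSpeech/train-other-500
    out_dir = Path("datasets/SV2TTS/encoder")    # placeholder output directory
    out_dir.mkdir(parents=True, exist_ok=True)
    preprocess_librispeech(datasets_root, out_dir, skip_existing=True)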
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/train.py ADDED
@@ -0,0 +1,127 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from .visualizations import Visualizations
4
+ from .data_objects import SpeakerVerificationDataLoader, SpeakerVerificationDataset
5
+ from .params_model import *
6
+ from .model import SpeakerEncoder
7
+ from .utils.profiler import Profiler
8
+ from pathlib import Path
9
+ import torch
10
+
11
+ def sync(device: torch.device):
12
+ # FIXME
13
+ return
14
+ # For correct profiling (cuda operations are async)
15
+ if device.type == "cuda":
16
+ torch.cuda.synchronize(device)
17
+
18
+ def train(run_id: str, clean_data_root: Path, models_dir: Path, umap_every: int, save_every: int,
19
+ backup_every: int, vis_every: int, force_restart: bool, visdom_server: str,
20
+ no_visdom: bool):
21
+ # Create a dataset and a dataloader
22
+ dataset = SpeakerVerificationDataset(clean_data_root)
23
+ loader = SpeakerVerificationDataLoader(
24
+ dataset,
25
+ speakers_per_batch,
26
+ utterances_per_speaker,
27
+ num_workers=8,
28
+ )
29
+
30
+ # Setup the device on which to run the forward pass and the loss. These can be different,
31
+ # because the forward pass is faster on the GPU whereas the loss is often (depending on your
32
+ # hyperparameters) faster on the CPU.
33
+ device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
34
+ # FIXME: currently, the gradient is None if loss_device is cuda
35
+ loss_device = torch.device("cpu")
36
+
37
+ # Create the model and the optimizer
38
+ model = SpeakerEncoder(device, loss_device)
39
+ optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate_init)
40
+ init_step = 1
41
+
42
+ # Configure file path for the model
43
+ state_fpath = models_dir.joinpath(run_id + ".pt")
44
+ backup_dir = models_dir.joinpath(run_id + "_backups")
45
+
46
+ # Load any existing model
47
+ if not force_restart:
48
+ if state_fpath.exists():
49
+ print("Found existing model \"%s\", loading it and resuming training." % run_id)
50
+ checkpoint = torch.load(state_fpath)
51
+ init_step = checkpoint["step"]
52
+ model.load_state_dict(checkpoint["model_state"])
53
+ optimizer.load_state_dict(checkpoint["optimizer_state"])
54
+ optimizer.param_groups[0]["lr"] = learning_rate_init
55
+ else:
56
+ print("No model \"%s\" found, starting training from scratch." % run_id)
57
+ else:
58
+ print("Starting the training from scratch.")
59
+ model.train()
60
+
61
+ # Initialize the visualization environment
62
+ vis = Visualizations(run_id, vis_every, server=visdom_server, disabled=no_visdom)
63
+ vis.log_dataset(dataset)
64
+ vis.log_params()
65
+ device_name = str(torch.cuda.get_device_name(0) if torch.cuda.is_available() else "CPU")
66
+ vis.log_implementation({"Device": device_name})
67
+
68
+ # Training loop
69
+ profiler = Profiler(summarize_every=10, disabled=False)
70
+ for step, speaker_batch in enumerate(loader, init_step):
71
+ profiler.tick("Blocking, waiting for batch (threaded)")
72
+
73
+ # Forward pass
74
+ inputs = torch.from_numpy(speaker_batch.data).to(device)
75
+ sync(device)
76
+ profiler.tick("Data to %s" % device)
77
+ embeds = model(inputs)
78
+ sync(device)
79
+ profiler.tick("Forward pass")
80
+ embeds_loss = embeds.view((speakers_per_batch, utterances_per_speaker, -1)).to(loss_device)
81
+ loss, eer = model.loss(embeds_loss)
82
+ sync(loss_device)
83
+ profiler.tick("Loss")
84
+
85
+ # Backward pass
86
+ model.zero_grad()
87
+ loss.backward()
88
+ profiler.tick("Backward pass")
89
+ model.do_gradient_ops()
90
+ optimizer.step()
91
+ profiler.tick("Parameter update")
92
+
93
+ # Update visualizations
94
+ # learning_rate = optimizer.param_groups[0]["lr"]
95
+ vis.update(loss.item(), eer, step)
96
+
97
+ # Draw projections and save them to the backup folder
98
+ if umap_every != 0 and step % umap_every == 0:
99
+ print("Drawing and saving projections (step %d)" % step)
100
+ backup_dir.mkdir(exist_ok=True)
101
+ projection_fpath = backup_dir.joinpath("%s_umap_%06d.png" % (run_id, step))
102
+ embeds = embeds.detach().cpu().numpy()
103
+ vis.draw_projections(embeds, utterances_per_speaker, step, projection_fpath)
104
+ vis.save()
105
+
106
+ # Overwrite the latest version of the model
107
+ if save_every != 0 and step % save_every == 0:
108
+ print("Saving the model (step %d)" % step)
109
+ torch.save({
110
+ "step": step + 1,
111
+ "model_state": model.state_dict(),
112
+ "optimizer_state": optimizer.state_dict(),
113
+ }, state_fpath)
114
+
115
+ # Make a backup
116
+ if backup_every != 0 and step % backup_every == 0:
117
+ print("Making a backup (step %d)" % step)
118
+ backup_dir.mkdir(exist_ok=True)
119
+ backup_fpath = backup_dir.joinpath("%s_bak_%06d.pt" % (run_id, step))
120
+ torch.save({
121
+ "step": step + 1,
122
+ "model_state": model.state_dict(),
123
+ "optimizer_state": optimizer.state_dict(),
124
+ }, backup_fpath)
125
+
126
+ profiler.tick("Extras (visualizations, saving)")
127
+
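Note: training is launched by calling train() with explicit paths and intervals. A hedged sketch: the paths are placeholders, visdom and umap must still be installed because visualizations.py imports them unconditionally, and the loop runs until interrupted.

    from pathlib import Path
    # hypothetical import path; adjust to your local package layout
    from encoder.train import train

    train(run_id="speaker_encoder",                        # checkpoint name
          clean_data_root=Path("datasets/SV2TTS/encoder"), # output of preprocess.py
          models_dir=Path("saved_models"),
          umap_every=500, save_every=500, backup_every=5000, vis_every=10,
          force_restart=False,
          visdom_server="http://localhost",
          no_visdom=True)                                  # disable the visdom dashboard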
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/__init__.py ADDED
@@ -0,0 +1 @@
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/argutils.py ADDED
@@ -0,0 +1,42 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import argparse
6
+
7
+ _type_priorities = [ # In decreasing order
8
+ Path,
9
+ str,
10
+ int,
11
+ float,
12
+ bool,
13
+ ]
14
+
15
+ def _priority(o):
16
+ p = next((i for i, t in enumerate(_type_priorities) if type(o) is t), None)
17
+ if p is not None:
18
+ return p
19
+ p = next((i for i, t in enumerate(_type_priorities) if isinstance(o, t)), None)
20
+ if p is not None:
21
+ return p
22
+ return len(_type_priorities)
23
+
24
+ def print_args(args: argparse.Namespace, parser=None):
25
+ args = vars(args)
26
+ if parser is None:
27
+ priorities = list(map(_priority, args.values()))
28
+ else:
29
+ all_params = [a.dest for g in parser._action_groups for a in g._group_actions ]
30
+ priority = lambda p: all_params.index(p) if p in all_params else len(all_params)
31
+ priorities = list(map(priority, args.keys()))
32
+
33
+ pad = max(map(len, args.keys())) + 3
34
+ indices = np.lexsort((list(args.keys()), priorities))
35
+ items = list(args.items())
36
+
37
+ print("Arguments:")
38
+ for i in indices:
39
+ param, value = items[i]
40
+ print(" {0}:{1}{2}".format(param, ' ' * (pad - len(param)), value))
41
+ print("")
42
+
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/logmmse.py ADDED
@@ -0,0 +1,222 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ import numpy as np
4
+ import math
5
+ from scipy.special import expn
6
+ from collections import namedtuple
7
+
8
+ NoiseProfile = namedtuple("NoiseProfile", "sampling_rate window_size len1 len2 win n_fft noise_mu2")
9
+
10
+
11
+ def profile_noise(noise, sampling_rate, window_size=0):
12
+ """
13
+ Creates a profile of the noise in a given waveform.
14
+
15
+ :param noise: a waveform containing noise ONLY, as a numpy array of floats or ints.
16
+ :param sampling_rate: the sampling rate of the audio
17
+ :param window_size: the size of the window the logmmse algorithm operates on. A default value
18
+ will be picked if left as 0.
19
+ :return: a NoiseProfile object
20
+ """
21
+ noise, dtype = to_float(noise)
22
+ noise += np.finfo(np.float64).eps
23
+
24
+ if window_size == 0:
25
+ window_size = int(math.floor(0.02 * sampling_rate))
26
+
27
+ if window_size % 2 == 1:
28
+ window_size = window_size + 1
29
+
30
+ perc = 50
31
+ len1 = int(math.floor(window_size * perc / 100))
32
+ len2 = int(window_size - len1)
33
+
34
+ win = np.hanning(window_size)
35
+ win = win * len2 / np.sum(win)
36
+ n_fft = 2 * window_size
37
+
38
+ noise_mean = np.zeros(n_fft)
39
+ n_frames = len(noise) // window_size
40
+ for j in range(0, window_size * n_frames, window_size):
41
+ noise_mean += np.absolute(np.fft.fft(win * noise[j:j + window_size], n_fft, axis=0))
42
+ noise_mu2 = (noise_mean / n_frames) ** 2
43
+
44
+ return NoiseProfile(sampling_rate, window_size, len1, len2, win, n_fft, noise_mu2)
45
+
46
+
47
+ def denoise(wav, noise_profile: NoiseProfile, eta=0.15):
48
+ """
49
+ Cleans the noise from a speech waveform given a noise profile. The waveform must have the
50
+ same sampling rate as the one used to create the noise profile.
51
+
52
+ :param wav: a speech waveform as a numpy array of floats or ints.
53
+ :param noise_profile: a NoiseProfile object that was created from a similar (or a segment of
54
+ the same) waveform.
55
+ :param eta: voice threshold for noise update. While the voice activation detection value is
56
+ below this threshold, the noise profile will be continuously updated throughout the audio.
57
+ Set to 0 to disable updating the noise profile.
58
+ :return: the clean wav as a numpy array of floats or ints of the same length.
59
+ """
60
+ wav, dtype = to_float(wav)
61
+ wav += np.finfo(np.float64).eps
62
+ p = noise_profile
63
+
64
+ nframes = int(math.floor(len(wav) / p.len2) - math.floor(p.window_size / p.len2))
65
+ x_final = np.zeros(nframes * p.len2)
66
+
67
+ aa = 0.98
68
+ mu = 0.98
69
+ ksi_min = 10 ** (-25 / 10)
70
+
71
+ x_old = np.zeros(p.len1)
72
+ xk_prev = np.zeros(p.len1)
73
+ noise_mu2 = p.noise_mu2
74
+ for k in range(0, nframes * p.len2, p.len2):
75
+ insign = p.win * wav[k:k + p.window_size]
76
+
77
+ spec = np.fft.fft(insign, p.n_fft, axis=0)
78
+ sig = np.absolute(spec)
79
+ sig2 = sig ** 2
80
+
81
+ gammak = np.minimum(sig2 / noise_mu2, 40)
82
+
83
+ if xk_prev.all() == 0:
84
+ ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
85
+ else:
86
+ ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
87
+ ksi = np.maximum(ksi_min, ksi)
88
+
89
+ log_sigma_k = gammak * ksi/(1 + ksi) - np.log(1 + ksi)
90
+ vad_decision = np.sum(log_sigma_k) / p.window_size
91
+ if vad_decision < eta:
92
+ noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
93
+
94
+ a = ksi / (1 + ksi)
95
+ vk = a * gammak
96
+ ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
97
+ hw = a * np.exp(ei_vk)
98
+ sig = sig * hw
99
+ xk_prev = sig ** 2
100
+ xi_w = np.fft.ifft(hw * spec, p.n_fft, axis=0)
101
+ xi_w = np.real(xi_w)
102
+
103
+ x_final[k:k + p.len2] = x_old + xi_w[0:p.len1]
104
+ x_old = xi_w[p.len1:p.window_size]
105
+
106
+ output = from_float(x_final, dtype)
107
+ output = np.pad(output, (0, len(wav) - len(output)), mode="constant")
108
+ return output
109
+
110
+
111
+ ## Alternative VAD algorithm to webrctvad. It has the advantage of not requiring to install that
112
+ ## darn package and it also works for any sampling rate. Maybe I'll eventually use it instead of
113
+ ## webrctvad
114
+ # def vad(wav, sampling_rate, eta=0.15, window_size=0):
115
+ # """
116
+ # TODO: fix doc
117
+ # Creates a profile of the noise in a given waveform.
118
+ #
119
+ # :param wav: a waveform containing noise ONLY, as a numpy array of floats or ints.
120
+ # :param sampling_rate: the sampling rate of the audio
121
+ # :param window_size: the size of the window the logmmse algorithm operates on. A default value
122
+ # will be picked if left as 0.
123
+ # :param eta: voice threshold for noise update. While the voice activation detection value is
124
+ # below this threshold, the noise profile will be continuously updated throughout the audio.
125
+ # Set to 0 to disable updating the noise profile.
126
+ # """
127
+ # wav, dtype = to_float(wav)
128
+ # wav += np.finfo(np.float64).eps
129
+ #
130
+ # if window_size == 0:
131
+ # window_size = int(math.floor(0.02 * sampling_rate))
132
+ #
133
+ # if window_size % 2 == 1:
134
+ # window_size = window_size + 1
135
+ #
136
+ # perc = 50
137
+ # len1 = int(math.floor(window_size * perc / 100))
138
+ # len2 = int(window_size - len1)
139
+ #
140
+ # win = np.hanning(window_size)
141
+ # win = win * len2 / np.sum(win)
142
+ # n_fft = 2 * window_size
143
+ #
144
+ # wav_mean = np.zeros(n_fft)
145
+ # n_frames = len(wav) // window_size
146
+ # for j in range(0, window_size * n_frames, window_size):
147
+ # wav_mean += np.absolute(np.fft.fft(win * wav[j:j + window_size], n_fft, axis=0))
148
+ # noise_mu2 = (wav_mean / n_frames) ** 2
149
+ #
150
+ # wav, dtype = to_float(wav)
151
+ # wav += np.finfo(np.float64).eps
152
+ #
153
+ # nframes = int(math.floor(len(wav) / len2) - math.floor(window_size / len2))
154
+ # vad = np.zeros(nframes * len2, dtype=np.bool)
155
+ #
156
+ # aa = 0.98
157
+ # mu = 0.98
158
+ # ksi_min = 10 ** (-25 / 10)
159
+ #
160
+ # xk_prev = np.zeros(len1)
161
+ # noise_mu2 = noise_mu2
162
+ # for k in range(0, nframes * len2, len2):
163
+ # insign = win * wav[k:k + window_size]
164
+ #
165
+ # spec = np.fft.fft(insign, n_fft, axis=0)
166
+ # sig = np.absolute(spec)
167
+ # sig2 = sig ** 2
168
+ #
169
+ # gammak = np.minimum(sig2 / noise_mu2, 40)
170
+ #
171
+ # if xk_prev.all() == 0:
172
+ # ksi = aa + (1 - aa) * np.maximum(gammak - 1, 0)
173
+ # else:
174
+ # ksi = aa * xk_prev / noise_mu2 + (1 - aa) * np.maximum(gammak - 1, 0)
175
+ # ksi = np.maximum(ksi_min, ksi)
176
+ #
177
+ # log_sigma_k = gammak * ksi / (1 + ksi) - np.log(1 + ksi)
178
+ # vad_decision = np.sum(log_sigma_k) / window_size
179
+ # if vad_decision < eta:
180
+ # noise_mu2 = mu * noise_mu2 + (1 - mu) * sig2
181
+ # print(vad_decision)
182
+ #
183
+ # a = ksi / (1 + ksi)
184
+ # vk = a * gammak
185
+ # ei_vk = 0.5 * expn(1, np.maximum(vk, 1e-8))
186
+ # hw = a * np.exp(ei_vk)
187
+ # sig = sig * hw
188
+ # xk_prev = sig ** 2
189
+ #
190
+ # vad[k:k + len2] = vad_decision >= eta
191
+ #
192
+ # vad = np.pad(vad, (0, len(wav) - len(vad)), mode="constant")
193
+ # return vad
194
+
195
+
196
+ def to_float(_input):
197
+ if _input.dtype == np.float64:
198
+ return _input, _input.dtype
199
+ elif _input.dtype == np.float32:
200
+ return _input.astype(np.float64), _input.dtype
201
+ elif _input.dtype == np.uint8:
202
+ return (_input - 128) / 128., _input.dtype
203
+ elif _input.dtype == np.int16:
204
+ return _input / 32768., _input.dtype
205
+ elif _input.dtype == np.int32:
206
+ return _input / 2147483648., _input.dtype
207
+ raise ValueError('Unsupported wave file format')
208
+
209
+
210
+ def from_float(_input, dtype):
211
+ if dtype == np.float64:
212
+ return _input, np.float64
213
+ elif dtype == np.float32:
214
+ return _input.astype(np.float32)
215
+ elif dtype == np.uint8:
216
+ return ((_input * 128) + 128).astype(np.uint8)
217
+ elif dtype == np.int16:
218
+ return (_input * 32768).astype(np.int16)
219
+ elif dtype == np.int32:
220
+ print(_input)
221
+ return (_input * 2147483648).astype(np.int32)
222
+ raise ValueError('Unsupported wave file format')
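Note: the intended use of this module is to profile a noise-only segment once and then denoise speech recorded in the same conditions. A minimal sketch on synthetic data; the import path is hypothetical.

    import numpy as np
    # hypothetical import path; adjust to your local package layout
    from encoder.utils.logmmse import profile_noise, denoise

    sr = 16000
    noise = (0.01 * np.random.randn(sr)).astype(np.float32)        # 1 s of noise only
    speech = (0.1 * np.random.randn(3 * sr)).astype(np.float32)    # stand-in for a noisy recording

    profile = profile_noise(noise, sr)       # estimate the noise spectrum once
    clean = denoise(speech, profile)         # same length and dtype as the input
    print(clean.shape, clean.dtype)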
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/utils/profiler.py ADDED
@@ -0,0 +1,47 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from time import perf_counter as timer
4
+ from collections import OrderedDict
5
+ import numpy as np
6
+
7
+
8
+ class Profiler:
9
+ def __init__(self, summarize_every=5, disabled=False):
10
+ self.last_tick = timer()
11
+ self.logs = OrderedDict()
12
+ self.summarize_every = summarize_every
13
+ self.disabled = disabled
14
+
15
+ def tick(self, name):
16
+ if self.disabled:
17
+ return
18
+
19
+ # Log the time needed to execute that function
20
+ if not name in self.logs:
21
+ self.logs[name] = []
22
+ if len(self.logs[name]) >= self.summarize_every:
23
+ self.summarize()
24
+ self.purge_logs()
25
+ self.logs[name].append(timer() - self.last_tick)
26
+
27
+ self.reset_timer()
28
+
29
+ def purge_logs(self):
30
+ for name in self.logs:
31
+ self.logs[name].clear()
32
+
33
+ def reset_timer(self):
34
+ self.last_tick = timer()
35
+
36
+ def summarize(self):
37
+ n = max(map(len, self.logs.values()))
38
+ assert n == self.summarize_every
39
+ print("\nAverage execution time over %d steps:" % n)
40
+
41
+ name_msgs = ["%s (%d/%d):" % (name, len(deltas), n) for name, deltas in self.logs.items()]
42
+ pad = max(map(len, name_msgs))
43
+ for name_msg, deltas in zip(name_msgs, self.logs.values()):
44
+ print(" %s mean: %4.0fms std: %4.0fms" %
45
+ (name_msg.ljust(pad), np.mean(deltas) * 1000, np.std(deltas) * 1000))
46
+ print("", flush=True)
47
+
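Note: a Profiler is driven by calling tick() after each stage of a loop; a small sketch (the import path is hypothetical):

    from time import sleep
    # hypothetical import path; adjust to your local package layout
    from encoder.utils.profiler import Profiler

    profiler = Profiler(summarize_every=5)
    for _ in range(6):
        sleep(0.01)
        profiler.tick("data loading")    # a mean/std summary prints once a label reaches 5 ticks
        sleep(0.02)
        profiler.tick("forward pass")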
dreamvoice/train_utils/src/modules/speaker_encoder/encoder/visualizations.py ADDED
@@ -0,0 +1,180 @@
1
+ """ from https://github.com/CorentinJ/Real-Time-Voice-Cloning """
2
+
3
+ from .data_objects.speaker_verification_dataset import SpeakerVerificationDataset
4
+ from datetime import datetime
5
+ from time import perf_counter as timer
6
+ import matplotlib.pyplot as plt
7
+ import numpy as np
8
+ # import webbrowser
9
+ import visdom
10
+ import umap
11
+
12
+ colormap = np.array([
13
+ [76, 255, 0],
14
+ [0, 127, 70],
15
+ [255, 0, 0],
16
+ [255, 217, 38],
17
+ [0, 135, 255],
18
+ [165, 0, 165],
19
+ [255, 167, 255],
20
+ [0, 255, 255],
21
+ [255, 96, 38],
22
+ [142, 76, 0],
23
+ [33, 0, 127],
24
+ [0, 0, 0],
25
+ [183, 183, 183],
26
+ ], dtype=float) / 255  # np.float was removed in NumPy 1.24+
27
+
28
+
29
+ class Visualizations:
30
+ def __init__(self, env_name=None, update_every=10, server="http://localhost", disabled=False):
31
+ # Tracking data
32
+ self.last_update_timestamp = timer()
33
+ self.update_every = update_every
34
+ self.step_times = []
35
+ self.losses = []
36
+ self.eers = []
37
+ print("Updating the visualizations every %d steps." % update_every)
38
+
39
+ # If visdom is disabled TODO: use a better paradigm for that
40
+ self.disabled = disabled
41
+ if self.disabled:
42
+ return
43
+
44
+ # Set the environment name
45
+ now = str(datetime.now().strftime("%d-%m %Hh%M"))
46
+ if env_name is None:
47
+ self.env_name = now
48
+ else:
49
+ self.env_name = "%s (%s)" % (env_name, now)
50
+
51
+ # Connect to visdom and open the corresponding window in the browser
52
+ try:
53
+ self.vis = visdom.Visdom(server, env=self.env_name, raise_exceptions=True)
54
+ except ConnectionError:
55
+ raise Exception("No visdom server detected. Run the command \"visdom\" in your CLI to "
56
+ "start it.")
57
+ # webbrowser.open("http://localhost:8097/env/" + self.env_name)
58
+
59
+ # Create the windows
60
+ self.loss_win = None
61
+ self.eer_win = None
62
+ # self.lr_win = None
63
+ self.implementation_win = None
64
+ self.projection_win = None
65
+ self.implementation_string = ""
66
+
67
+ def log_params(self):
68
+ if self.disabled:
69
+ return
70
+ from encoder import params_data
71
+ from encoder import params_model
72
+ param_string = "<b>Model parameters</b>:<br>"
73
+ for param_name in (p for p in dir(params_model) if not p.startswith("__")):
74
+ value = getattr(params_model, param_name)
75
+ param_string += "\t%s: %s<br>" % (param_name, value)
76
+ param_string += "<b>Data parameters</b>:<br>"
77
+ for param_name in (p for p in dir(params_data) if not p.startswith("__")):
78
+ value = getattr(params_data, param_name)
79
+ param_string += "\t%s: %s<br>" % (param_name, value)
80
+ self.vis.text(param_string, opts={"title": "Parameters"})
81
+
82
+ def log_dataset(self, dataset: SpeakerVerificationDataset):
83
+ if self.disabled:
84
+ return
85
+ dataset_string = ""
86
+ dataset_string += "<b>Speakers</b>: %s\n" % len(dataset.speakers)
87
+ dataset_string += "\n" + dataset.get_logs()
88
+ dataset_string = dataset_string.replace("\n", "<br>")
89
+ self.vis.text(dataset_string, opts={"title": "Dataset"})
90
+
91
+ def log_implementation(self, params):
92
+ if self.disabled:
93
+ return
94
+ implementation_string = ""
95
+ for param, value in params.items():
96
+ implementation_string += "<b>%s</b>: %s\n" % (param, value)
97
+ implementation_string = implementation_string.replace("\n", "<br>")
98
+ self.implementation_string = implementation_string
99
+ self.implementation_win = self.vis.text(
100
+ implementation_string,
101
+ opts={"title": "Training implementation"}
102
+ )
103
+
104
+ def update(self, loss, eer, step):
105
+ # Update the tracking data
106
+ now = timer()
107
+ self.step_times.append(1000 * (now - self.last_update_timestamp))
108
+ self.last_update_timestamp = now
109
+ self.losses.append(loss)
110
+ self.eers.append(eer)
111
+ print(".", end="")
112
+
113
+ # Update the plots every <update_every> steps
114
+ if step % self.update_every != 0:
115
+ return
116
+ time_string = "Step time: mean: %5dms std: %5dms" % \
117
+ (int(np.mean(self.step_times)), int(np.std(self.step_times)))
118
+ print("\nStep %6d Loss: %.4f EER: %.4f %s" %
119
+ (step, np.mean(self.losses), np.mean(self.eers), time_string))
120
+ if not self.disabled:
121
+ self.loss_win = self.vis.line(
122
+ [np.mean(self.losses)],
123
+ [step],
124
+ win=self.loss_win,
125
+ update="append" if self.loss_win else None,
126
+ opts=dict(
127
+ legend=["Avg. loss"],
128
+ xlabel="Step",
129
+ ylabel="Loss",
130
+ title="Loss",
131
+ )
132
+ )
133
+ self.eer_win = self.vis.line(
134
+ [np.mean(self.eers)],
135
+ [step],
136
+ win=self.eer_win,
137
+ update="append" if self.eer_win else None,
138
+ opts=dict(
139
+ legend=["Avg. EER"],
140
+ xlabel="Step",
141
+ ylabel="EER",
142
+ title="Equal error rate"
143
+ )
144
+ )
145
+ if self.implementation_win is not None:
146
+ self.vis.text(
147
+ self.implementation_string + ("<b>%s</b>" % time_string),
148
+ win=self.implementation_win,
149
+ opts={"title": "Training implementation"},
150
+ )
151
+
152
+ # Reset the tracking
153
+ self.losses.clear()
154
+ self.eers.clear()
155
+ self.step_times.clear()
156
+
157
+ def draw_projections(self, embeds, utterances_per_speaker, step, out_fpath=None,
158
+ max_speakers=10):
159
+ max_speakers = min(max_speakers, len(colormap))
160
+ embeds = embeds[:max_speakers * utterances_per_speaker]
161
+
162
+ n_speakers = len(embeds) // utterances_per_speaker
163
+ ground_truth = np.repeat(np.arange(n_speakers), utterances_per_speaker)
164
+ colors = [colormap[i] for i in ground_truth]
165
+
166
+ reducer = umap.UMAP()
167
+ projected = reducer.fit_transform(embeds)
168
+ plt.scatter(projected[:, 0], projected[:, 1], c=colors)
169
+ plt.gca().set_aspect("equal", "datalim")
170
+ plt.title("UMAP projection (step %d)" % step)
171
+ if not self.disabled:
172
+ self.projection_win = self.vis.matplot(plt, win=self.projection_win)
173
+ if out_fpath is not None:
174
+ plt.savefig(out_fpath)
175
+ plt.clf()
176
+
177
+ def save(self):
178
+ if not self.disabled:
179
+ self.vis.save([self.env_name])
180
+
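Note: with disabled=True the class above skips visdom entirely and can still render UMAP projections to disk. A sketch; visdom and umap-learn must be installed because they are imported at module level, and the import path is hypothetical.

    import numpy as np
    # hypothetical import path; adjust to your local package layout
    from encoder.visualizations import Visualizations

    vis = Visualizations("demo", update_every=10, disabled=True)   # no visdom server needed
    embeds = np.random.randn(6 * 4, 256)                           # 6 speakers x 4 utterances each
    vis.draw_projections(embeds, utterances_per_speaker=4, step=0, out_fpath="umap_step0.png")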
dreamvoice/train_utils/src/openvoice/__init__.py ADDED
File without changes
dreamvoice/train_utils/src/openvoice/api.py ADDED
@@ -0,0 +1,202 @@
1
+ import torch
2
+ import numpy as np
3
+ import re
4
+ import soundfile
5
+ from openvoice import utils
6
+ from openvoice import commons
7
+ import os
8
+ import librosa
9
+ from openvoice.text import text_to_sequence
10
+ from openvoice.mel_processing import spectrogram_torch
11
+ from openvoice.models import SynthesizerTrn
12
+
13
+
14
+ class OpenVoiceBaseClass(object):
15
+ def __init__(self,
16
+ config_path,
17
+ device='cuda:0'):
18
+ if 'cuda' in device:
19
+ assert torch.cuda.is_available()
20
+
21
+ hps = utils.get_hparams_from_file(config_path)
22
+
23
+ model = SynthesizerTrn(
24
+ len(getattr(hps, 'symbols', [])),
25
+ hps.data.filter_length // 2 + 1,
26
+ n_speakers=hps.data.n_speakers,
27
+ **hps.model,
28
+ ).to(device)
29
+
30
+ model.eval()
31
+ self.model = model
32
+ self.hps = hps
33
+ self.device = device
34
+
35
+ def load_ckpt(self, ckpt_path):
36
+ checkpoint_dict = torch.load(ckpt_path, map_location=torch.device(self.device))
37
+ a, b = self.model.load_state_dict(checkpoint_dict['model'], strict=False)
38
+ print("Loaded checkpoint '{}'".format(ckpt_path))
39
+ print('missing/unexpected keys:', a, b)
40
+
41
+
42
+ class BaseSpeakerTTS(OpenVoiceBaseClass):
43
+ language_marks = {
44
+ "english": "EN",
45
+ "chinese": "ZH",
46
+ }
47
+
48
+ @staticmethod
49
+ def get_text(text, hps, is_symbol):
50
+ text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
51
+ if hps.data.add_blank:
52
+ text_norm = commons.intersperse(text_norm, 0)
53
+ text_norm = torch.LongTensor(text_norm)
54
+ return text_norm
55
+
56
+ @staticmethod
57
+ def audio_numpy_concat(segment_data_list, sr, speed=1.):
58
+ audio_segments = []
59
+ for segment_data in segment_data_list:
60
+ audio_segments += segment_data.reshape(-1).tolist()
61
+ audio_segments += [0] * int((sr * 0.05)/speed)
62
+ audio_segments = np.array(audio_segments).astype(np.float32)
63
+ return audio_segments
64
+
65
+ @staticmethod
66
+ def split_sentences_into_pieces(text, language_str):
67
+ texts = utils.split_sentence(text, language_str=language_str)
68
+ print(" > Text splitted to sentences.")
69
+ print('\n'.join(texts))
70
+ print(" > ===========================")
71
+ return texts
72
+
73
+ def tts(self, text, output_path, speaker, language='English', speed=1.0):
74
+ mark = self.language_marks.get(language.lower(), None)
75
+ assert mark is not None, f"language {language} is not supported"
76
+
77
+ texts = self.split_sentences_into_pieces(text, mark)
78
+
79
+ audio_list = []
80
+ for t in texts:
81
+ t = re.sub(r'([a-z])([A-Z])', r'\1 \2', t)
82
+ t = f'[{mark}]{t}[{mark}]'
83
+ stn_tst = self.get_text(t, self.hps, False)
84
+ device = self.device
85
+ speaker_id = self.hps.speakers[speaker]
86
+ with torch.no_grad():
87
+ x_tst = stn_tst.unsqueeze(0).to(device)
88
+ x_tst_lengths = torch.LongTensor([stn_tst.size(0)]).to(device)
89
+ sid = torch.LongTensor([speaker_id]).to(device)
90
+ audio = self.model.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=0.667, noise_scale_w=0.6,
91
+ length_scale=1.0 / speed)[0][0, 0].data.cpu().float().numpy()
92
+ audio_list.append(audio)
93
+ audio = self.audio_numpy_concat(audio_list, sr=self.hps.data.sampling_rate, speed=speed)
94
+
95
+ if output_path is None:
96
+ return audio
97
+ else:
98
+ soundfile.write(output_path, audio, self.hps.data.sampling_rate)
99
+
100
+
101
+ class ToneColorConverter(OpenVoiceBaseClass):
102
+ def __init__(self, *args, **kwargs):
103
+ super().__init__(*args, **kwargs)
104
+
105
+ if kwargs.get('enable_watermark', True):
106
+ import wavmark
107
+ self.watermark_model = wavmark.load_model().to(self.device)
108
+ else:
109
+ self.watermark_model = None
110
+ self.version = getattr(self.hps, '_version_', "v1")
111
+
112
+
113
+
114
+ def extract_se(self, ref_wav_list, se_save_path=None):
115
+ if isinstance(ref_wav_list, str):
116
+ ref_wav_list = [ref_wav_list]
117
+
118
+ device = self.device
119
+ hps = self.hps
120
+ gs = []
121
+
122
+ for fname in ref_wav_list:
123
+ audio_ref, sr = librosa.load(fname, sr=hps.data.sampling_rate)
124
+ y = torch.FloatTensor(audio_ref)
125
+ y = y.to(device)
126
+ y = y.unsqueeze(0)
127
+ y = spectrogram_torch(y, hps.data.filter_length,
128
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
129
+ center=False).to(device)
130
+ with torch.no_grad():
131
+ g = self.model.ref_enc(y.transpose(1, 2)).unsqueeze(-1)
132
+ gs.append(g.detach())
133
+ gs = torch.stack(gs).mean(0)
134
+
135
+ if se_save_path is not None:
136
+ os.makedirs(os.path.dirname(se_save_path), exist_ok=True)
137
+ torch.save(gs.cpu(), se_save_path)
138
+
139
+ return gs
140
+
141
+ def convert(self, audio_src_path, src_se, tgt_se, output_path=None, tau=0.3, message="default"):
142
+ hps = self.hps
143
+ # load audio
144
+ audio, sample_rate = librosa.load(audio_src_path, sr=hps.data.sampling_rate)
145
+ audio = torch.tensor(audio).float()
146
+
147
+ with torch.no_grad():
148
+ y = torch.FloatTensor(audio).to(self.device)
149
+ y = y.unsqueeze(0)
150
+ spec = spectrogram_torch(y, hps.data.filter_length,
151
+ hps.data.sampling_rate, hps.data.hop_length, hps.data.win_length,
152
+ center=False).to(self.device)
153
+ spec_lengths = torch.LongTensor([spec.size(-1)]).to(self.device)
154
+ audio = self.model.voice_conversion(spec, spec_lengths, sid_src=src_se, sid_tgt=tgt_se, tau=tau)[0][
155
+ 0, 0].data.cpu().float().numpy()
156
+ audio = self.add_watermark(audio, message)
157
+ if output_path is None:
158
+ return audio
159
+ else:
160
+ soundfile.write(output_path, audio, hps.data.sampling_rate)
161
+
162
+ def add_watermark(self, audio, message):
163
+ if self.watermark_model is None:
164
+ return audio
165
+ device = self.device
166
+ bits = utils.string_to_bits(message).reshape(-1)
167
+ n_repeat = len(bits) // 32
168
+
169
+ K = 16000
170
+ coeff = 2
171
+ for n in range(n_repeat):
172
+ trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
173
+ if len(trunck) != K:
174
+ print('Audio too short, fail to add watermark')
175
+ break
176
+ message_npy = bits[n * 32: (n + 1) * 32]
177
+
178
+ with torch.no_grad():
179
+ signal = torch.FloatTensor(trunck).to(device)[None]
180
+ message_tensor = torch.FloatTensor(message_npy).to(device)[None]
181
+ signal_wmd_tensor = self.watermark_model.encode(signal, message_tensor)
182
+ signal_wmd_npy = signal_wmd_tensor.detach().cpu().squeeze()
183
+ audio[(coeff * n) * K: (coeff * n + 1) * K] = signal_wmd_npy
184
+ return audio
185
+
186
+ def detect_watermark(self, audio, n_repeat):
187
+ bits = []
188
+ K = 16000
189
+ coeff = 2
190
+ for n in range(n_repeat):
191
+ trunck = audio[(coeff * n) * K: (coeff * n + 1) * K]
192
+ if len(trunck) != K:
193
+ print('Audio too short, fail to detect watermark')
194
+ return 'Fail'
195
+ with torch.no_grad():
196
+ signal = torch.FloatTensor(trunck).to(self.device).unsqueeze(0)
197
+ message_decoded_npy = (self.watermark_model.decode(signal) >= 0.5).int().detach().cpu().numpy().squeeze()
198
+ bits.append(message_decoded_npy)
199
+ bits = np.stack(bits).reshape(-1, 8)
200
+ message = utils.bits_to_string(bits)
201
+ return message
202
+
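For orientation, here is a minimal usage sketch of the ToneColorConverter API added in api.py above. It is not part of this commit: the config path, checkpoint path, and wav filenames are placeholders, and it assumes dreamvoice/train_utils/src is on PYTHONPATH so that the openvoice package resolves (the default constructor also loads the wavmark watermarking model, so wavmark must be installed).

import torch
from openvoice.api import ToneColorConverter

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'

# Placeholder paths for an OpenVoice converter config/checkpoint.
converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

# extract_se() returns a [1, gin_channels, 1] speaker ("tone color") embedding,
# averaged over the reference recordings.
source_se = converter.extract_se(['samples/source_speaker.wav'])
target_se = converter.extract_se(['samples/target_speaker.wav'])

# Re-synthesize the source utterance with the target speaker's tone color.
converter.convert(
    audio_src_path='samples/source_utterance.wav',
    src_se=source_se,
    tgt_se=target_se,
    output_path='outputs/converted.wav',
    tau=0.3,
)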
dreamvoice/train_utils/src/openvoice/attentions.py ADDED
@@ -0,0 +1,465 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from openvoice import commons
7
+ import logging
8
+
9
+ logger = logging.getLogger(__name__)
10
+
11
+
12
+ class LayerNorm(nn.Module):
13
+ def __init__(self, channels, eps=1e-5):
14
+ super().__init__()
15
+ self.channels = channels
16
+ self.eps = eps
17
+
18
+ self.gamma = nn.Parameter(torch.ones(channels))
19
+ self.beta = nn.Parameter(torch.zeros(channels))
20
+
21
+ def forward(self, x):
22
+ x = x.transpose(1, -1)
23
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
24
+ return x.transpose(1, -1)
25
+
26
+
27
+ @torch.jit.script
28
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
29
+ n_channels_int = n_channels[0]
30
+ in_act = input_a + input_b
31
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
32
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
33
+ acts = t_act * s_act
34
+ return acts
35
+
36
+
37
+ class Encoder(nn.Module):
38
+ def __init__(
39
+ self,
40
+ hidden_channels,
41
+ filter_channels,
42
+ n_heads,
43
+ n_layers,
44
+ kernel_size=1,
45
+ p_dropout=0.0,
46
+ window_size=4,
47
+ isflow=True,
48
+ **kwargs
49
+ ):
50
+ super().__init__()
51
+ self.hidden_channels = hidden_channels
52
+ self.filter_channels = filter_channels
53
+ self.n_heads = n_heads
54
+ self.n_layers = n_layers
55
+ self.kernel_size = kernel_size
56
+ self.p_dropout = p_dropout
57
+ self.window_size = window_size
58
+ # if isflow:
59
+ # cond_layer = torch.nn.Conv1d(256, 2*hidden_channels*n_layers, 1)
60
+ # self.cond_pre = torch.nn.Conv1d(hidden_channels, 2*hidden_channels, 1)
61
+ # self.cond_layer = weight_norm(cond_layer, name='weight')
62
+ # self.gin_channels = 256
63
+ self.cond_layer_idx = self.n_layers
64
+ if "gin_channels" in kwargs:
65
+ self.gin_channels = kwargs["gin_channels"]
66
+ if self.gin_channels != 0:
67
+ self.spk_emb_linear = nn.Linear(self.gin_channels, self.hidden_channels)
68
+ # vits2 says 3rd block, so idx is 2 by default
69
+ self.cond_layer_idx = (
70
+ kwargs["cond_layer_idx"] if "cond_layer_idx" in kwargs else 2
71
+ )
72
+ # logging.debug(self.gin_channels, self.cond_layer_idx)
73
+ assert (
74
+ self.cond_layer_idx < self.n_layers
75
+ ), "cond_layer_idx should be less than n_layers"
76
+ self.drop = nn.Dropout(p_dropout)
77
+ self.attn_layers = nn.ModuleList()
78
+ self.norm_layers_1 = nn.ModuleList()
79
+ self.ffn_layers = nn.ModuleList()
80
+ self.norm_layers_2 = nn.ModuleList()
81
+
82
+ for i in range(self.n_layers):
83
+ self.attn_layers.append(
84
+ MultiHeadAttention(
85
+ hidden_channels,
86
+ hidden_channels,
87
+ n_heads,
88
+ p_dropout=p_dropout,
89
+ window_size=window_size,
90
+ )
91
+ )
92
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
93
+ self.ffn_layers.append(
94
+ FFN(
95
+ hidden_channels,
96
+ hidden_channels,
97
+ filter_channels,
98
+ kernel_size,
99
+ p_dropout=p_dropout,
100
+ )
101
+ )
102
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
103
+
104
+ def forward(self, x, x_mask, g=None):
105
+ attn_mask = x_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
106
+ x = x * x_mask
107
+ for i in range(self.n_layers):
108
+ if i == self.cond_layer_idx and g is not None:
109
+ g = self.spk_emb_linear(g.transpose(1, 2))
110
+ g = g.transpose(1, 2)
111
+ x = x + g
112
+ x = x * x_mask
113
+ y = self.attn_layers[i](x, x, attn_mask)
114
+ y = self.drop(y)
115
+ x = self.norm_layers_1[i](x + y)
116
+
117
+ y = self.ffn_layers[i](x, x_mask)
118
+ y = self.drop(y)
119
+ x = self.norm_layers_2[i](x + y)
120
+ x = x * x_mask
121
+ return x
122
+
123
+
124
+ class Decoder(nn.Module):
125
+ def __init__(
126
+ self,
127
+ hidden_channels,
128
+ filter_channels,
129
+ n_heads,
130
+ n_layers,
131
+ kernel_size=1,
132
+ p_dropout=0.0,
133
+ proximal_bias=False,
134
+ proximal_init=True,
135
+ **kwargs
136
+ ):
137
+ super().__init__()
138
+ self.hidden_channels = hidden_channels
139
+ self.filter_channels = filter_channels
140
+ self.n_heads = n_heads
141
+ self.n_layers = n_layers
142
+ self.kernel_size = kernel_size
143
+ self.p_dropout = p_dropout
144
+ self.proximal_bias = proximal_bias
145
+ self.proximal_init = proximal_init
146
+
147
+ self.drop = nn.Dropout(p_dropout)
148
+ self.self_attn_layers = nn.ModuleList()
149
+ self.norm_layers_0 = nn.ModuleList()
150
+ self.encdec_attn_layers = nn.ModuleList()
151
+ self.norm_layers_1 = nn.ModuleList()
152
+ self.ffn_layers = nn.ModuleList()
153
+ self.norm_layers_2 = nn.ModuleList()
154
+ for i in range(self.n_layers):
155
+ self.self_attn_layers.append(
156
+ MultiHeadAttention(
157
+ hidden_channels,
158
+ hidden_channels,
159
+ n_heads,
160
+ p_dropout=p_dropout,
161
+ proximal_bias=proximal_bias,
162
+ proximal_init=proximal_init,
163
+ )
164
+ )
165
+ self.norm_layers_0.append(LayerNorm(hidden_channels))
166
+ self.encdec_attn_layers.append(
167
+ MultiHeadAttention(
168
+ hidden_channels, hidden_channels, n_heads, p_dropout=p_dropout
169
+ )
170
+ )
171
+ self.norm_layers_1.append(LayerNorm(hidden_channels))
172
+ self.ffn_layers.append(
173
+ FFN(
174
+ hidden_channels,
175
+ hidden_channels,
176
+ filter_channels,
177
+ kernel_size,
178
+ p_dropout=p_dropout,
179
+ causal=True,
180
+ )
181
+ )
182
+ self.norm_layers_2.append(LayerNorm(hidden_channels))
183
+
184
+ def forward(self, x, x_mask, h, h_mask):
185
+ """
186
+ x: decoder input
187
+ h: encoder output
188
+ """
189
+ self_attn_mask = commons.subsequent_mask(x_mask.size(2)).to(
190
+ device=x.device, dtype=x.dtype
191
+ )
192
+ encdec_attn_mask = h_mask.unsqueeze(2) * x_mask.unsqueeze(-1)
193
+ x = x * x_mask
194
+ for i in range(self.n_layers):
195
+ y = self.self_attn_layers[i](x, x, self_attn_mask)
196
+ y = self.drop(y)
197
+ x = self.norm_layers_0[i](x + y)
198
+
199
+ y = self.encdec_attn_layers[i](x, h, encdec_attn_mask)
200
+ y = self.drop(y)
201
+ x = self.norm_layers_1[i](x + y)
202
+
203
+ y = self.ffn_layers[i](x, x_mask)
204
+ y = self.drop(y)
205
+ x = self.norm_layers_2[i](x + y)
206
+ x = x * x_mask
207
+ return x
208
+
209
+
210
+ class MultiHeadAttention(nn.Module):
211
+ def __init__(
212
+ self,
213
+ channels,
214
+ out_channels,
215
+ n_heads,
216
+ p_dropout=0.0,
217
+ window_size=None,
218
+ heads_share=True,
219
+ block_length=None,
220
+ proximal_bias=False,
221
+ proximal_init=False,
222
+ ):
223
+ super().__init__()
224
+ assert channels % n_heads == 0
225
+
226
+ self.channels = channels
227
+ self.out_channels = out_channels
228
+ self.n_heads = n_heads
229
+ self.p_dropout = p_dropout
230
+ self.window_size = window_size
231
+ self.heads_share = heads_share
232
+ self.block_length = block_length
233
+ self.proximal_bias = proximal_bias
234
+ self.proximal_init = proximal_init
235
+ self.attn = None
236
+
237
+ self.k_channels = channels // n_heads
238
+ self.conv_q = nn.Conv1d(channels, channels, 1)
239
+ self.conv_k = nn.Conv1d(channels, channels, 1)
240
+ self.conv_v = nn.Conv1d(channels, channels, 1)
241
+ self.conv_o = nn.Conv1d(channels, out_channels, 1)
242
+ self.drop = nn.Dropout(p_dropout)
243
+
244
+ if window_size is not None:
245
+ n_heads_rel = 1 if heads_share else n_heads
246
+ rel_stddev = self.k_channels**-0.5
247
+ self.emb_rel_k = nn.Parameter(
248
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
249
+ * rel_stddev
250
+ )
251
+ self.emb_rel_v = nn.Parameter(
252
+ torch.randn(n_heads_rel, window_size * 2 + 1, self.k_channels)
253
+ * rel_stddev
254
+ )
255
+
256
+ nn.init.xavier_uniform_(self.conv_q.weight)
257
+ nn.init.xavier_uniform_(self.conv_k.weight)
258
+ nn.init.xavier_uniform_(self.conv_v.weight)
259
+ if proximal_init:
260
+ with torch.no_grad():
261
+ self.conv_k.weight.copy_(self.conv_q.weight)
262
+ self.conv_k.bias.copy_(self.conv_q.bias)
263
+
264
+ def forward(self, x, c, attn_mask=None):
265
+ q = self.conv_q(x)
266
+ k = self.conv_k(c)
267
+ v = self.conv_v(c)
268
+
269
+ x, self.attn = self.attention(q, k, v, mask=attn_mask)
270
+
271
+ x = self.conv_o(x)
272
+ return x
273
+
274
+ def attention(self, query, key, value, mask=None):
275
+ # reshape [b, d, t] -> [b, n_h, t, d_k]
276
+ b, d, t_s, t_t = (*key.size(), query.size(2))
277
+ query = query.view(b, self.n_heads, self.k_channels, t_t).transpose(2, 3)
278
+ key = key.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
279
+ value = value.view(b, self.n_heads, self.k_channels, t_s).transpose(2, 3)
280
+
281
+ scores = torch.matmul(query / math.sqrt(self.k_channels), key.transpose(-2, -1))
282
+ if self.window_size is not None:
283
+ assert (
284
+ t_s == t_t
285
+ ), "Relative attention is only available for self-attention."
286
+ key_relative_embeddings = self._get_relative_embeddings(self.emb_rel_k, t_s)
287
+ rel_logits = self._matmul_with_relative_keys(
288
+ query / math.sqrt(self.k_channels), key_relative_embeddings
289
+ )
290
+ scores_local = self._relative_position_to_absolute_position(rel_logits)
291
+ scores = scores + scores_local
292
+ if self.proximal_bias:
293
+ assert t_s == t_t, "Proximal bias is only available for self-attention."
294
+ scores = scores + self._attention_bias_proximal(t_s).to(
295
+ device=scores.device, dtype=scores.dtype
296
+ )
297
+ if mask is not None:
298
+ scores = scores.masked_fill(mask == 0, -1e4)
299
+ if self.block_length is not None:
300
+ assert (
301
+ t_s == t_t
302
+ ), "Local attention is only available for self-attention."
303
+ block_mask = (
304
+ torch.ones_like(scores)
305
+ .triu(-self.block_length)
306
+ .tril(self.block_length)
307
+ )
308
+ scores = scores.masked_fill(block_mask == 0, -1e4)
309
+ p_attn = F.softmax(scores, dim=-1) # [b, n_h, t_t, t_s]
310
+ p_attn = self.drop(p_attn)
311
+ output = torch.matmul(p_attn, value)
312
+ if self.window_size is not None:
313
+ relative_weights = self._absolute_position_to_relative_position(p_attn)
314
+ value_relative_embeddings = self._get_relative_embeddings(
315
+ self.emb_rel_v, t_s
316
+ )
317
+ output = output + self._matmul_with_relative_values(
318
+ relative_weights, value_relative_embeddings
319
+ )
320
+ output = (
321
+ output.transpose(2, 3).contiguous().view(b, d, t_t)
322
+ ) # [b, n_h, t_t, d_k] -> [b, d, t_t]
323
+ return output, p_attn
324
+
325
+ def _matmul_with_relative_values(self, x, y):
326
+ """
327
+ x: [b, h, l, m]
328
+ y: [h or 1, m, d]
329
+ ret: [b, h, l, d]
330
+ """
331
+ ret = torch.matmul(x, y.unsqueeze(0))
332
+ return ret
333
+
334
+ def _matmul_with_relative_keys(self, x, y):
335
+ """
336
+ x: [b, h, l, d]
337
+ y: [h or 1, m, d]
338
+ ret: [b, h, l, m]
339
+ """
340
+ ret = torch.matmul(x, y.unsqueeze(0).transpose(-2, -1))
341
+ return ret
342
+
343
+ def _get_relative_embeddings(self, relative_embeddings, length):
344
+ 2 * self.window_size + 1  # note: this bare expression has no effect; the embedding table width is 2 * window_size + 1
345
+ # Pad first before slice to avoid using cond ops.
346
+ pad_length = max(length - (self.window_size + 1), 0)
347
+ slice_start_position = max((self.window_size + 1) - length, 0)
348
+ slice_end_position = slice_start_position + 2 * length - 1
349
+ if pad_length > 0:
350
+ padded_relative_embeddings = F.pad(
351
+ relative_embeddings,
352
+ commons.convert_pad_shape([[0, 0], [pad_length, pad_length], [0, 0]]),
353
+ )
354
+ else:
355
+ padded_relative_embeddings = relative_embeddings
356
+ used_relative_embeddings = padded_relative_embeddings[
357
+ :, slice_start_position:slice_end_position
358
+ ]
359
+ return used_relative_embeddings
360
+
361
+ def _relative_position_to_absolute_position(self, x):
362
+ """
363
+ x: [b, h, l, 2*l-1]
364
+ ret: [b, h, l, l]
365
+ """
366
+ batch, heads, length, _ = x.size()
367
+ # Concat columns of pad to shift from relative to absolute indexing.
368
+ x = F.pad(x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, 1]]))
369
+
370
+ # Concat extra elements so to add up to shape (len+1, 2*len-1).
371
+ x_flat = x.view([batch, heads, length * 2 * length])
372
+ x_flat = F.pad(
373
+ x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [0, length - 1]])
374
+ )
375
+
376
+ # Reshape and slice out the padded elements.
377
+ x_final = x_flat.view([batch, heads, length + 1, 2 * length - 1])[
378
+ :, :, :length, length - 1 :
379
+ ]
380
+ return x_final
381
+
382
+ def _absolute_position_to_relative_position(self, x):
383
+ """
384
+ x: [b, h, l, l]
385
+ ret: [b, h, l, 2*l-1]
386
+ """
387
+ batch, heads, length, _ = x.size()
388
+ # pad along column
389
+ x = F.pad(
390
+ x, commons.convert_pad_shape([[0, 0], [0, 0], [0, 0], [0, length - 1]])
391
+ )
392
+ x_flat = x.view([batch, heads, length**2 + length * (length - 1)])
393
+ # add 0's in the beginning that will skew the elements after reshape
394
+ x_flat = F.pad(x_flat, commons.convert_pad_shape([[0, 0], [0, 0], [length, 0]]))
395
+ x_final = x_flat.view([batch, heads, length, 2 * length])[:, :, :, 1:]
396
+ return x_final
397
+
398
+ def _attention_bias_proximal(self, length):
399
+ """Bias for self-attention to encourage attention to close positions.
400
+ Args:
401
+ length: an integer scalar.
402
+ Returns:
403
+ a Tensor with shape [1, 1, length, length]
404
+ """
405
+ r = torch.arange(length, dtype=torch.float32)
406
+ diff = torch.unsqueeze(r, 0) - torch.unsqueeze(r, 1)
407
+ return torch.unsqueeze(torch.unsqueeze(-torch.log1p(torch.abs(diff)), 0), 0)
408
+
409
+
410
+ class FFN(nn.Module):
411
+ def __init__(
412
+ self,
413
+ in_channels,
414
+ out_channels,
415
+ filter_channels,
416
+ kernel_size,
417
+ p_dropout=0.0,
418
+ activation=None,
419
+ causal=False,
420
+ ):
421
+ super().__init__()
422
+ self.in_channels = in_channels
423
+ self.out_channels = out_channels
424
+ self.filter_channels = filter_channels
425
+ self.kernel_size = kernel_size
426
+ self.p_dropout = p_dropout
427
+ self.activation = activation
428
+ self.causal = causal
429
+
430
+ if causal:
431
+ self.padding = self._causal_padding
432
+ else:
433
+ self.padding = self._same_padding
434
+
435
+ self.conv_1 = nn.Conv1d(in_channels, filter_channels, kernel_size)
436
+ self.conv_2 = nn.Conv1d(filter_channels, out_channels, kernel_size)
437
+ self.drop = nn.Dropout(p_dropout)
438
+
439
+ def forward(self, x, x_mask):
440
+ x = self.conv_1(self.padding(x * x_mask))
441
+ if self.activation == "gelu":
442
+ x = x * torch.sigmoid(1.702 * x)
443
+ else:
444
+ x = torch.relu(x)
445
+ x = self.drop(x)
446
+ x = self.conv_2(self.padding(x * x_mask))
447
+ return x * x_mask
448
+
449
+ def _causal_padding(self, x):
450
+ if self.kernel_size == 1:
451
+ return x
452
+ pad_l = self.kernel_size - 1
453
+ pad_r = 0
454
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
455
+ x = F.pad(x, commons.convert_pad_shape(padding))
456
+ return x
457
+
458
+ def _same_padding(self, x):
459
+ if self.kernel_size == 1:
460
+ return x
461
+ pad_l = (self.kernel_size - 1) // 2
462
+ pad_r = self.kernel_size // 2
463
+ padding = [[0, 0], [0, 0], [pad_l, pad_r]]
464
+ x = F.pad(x, commons.convert_pad_shape(padding))
465
+ return x
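A quick shape check (an assumed example, not included in the commit) for the relative-position Encoder above: it takes a [batch, hidden_channels, time] tensor plus a [batch, 1, time] mask and returns a tensor of the same shape.

import torch
from openvoice.attentions import Encoder  # assumes the openvoice package is importable

enc = Encoder(hidden_channels=192, filter_channels=768,
              n_heads=2, n_layers=6, kernel_size=3, p_dropout=0.1)

x = torch.randn(2, 192, 100)                       # [b, h, t]
lengths = torch.tensor([100, 80])
x_mask = (torch.arange(100)[None, :] < lengths[:, None]).unsqueeze(1).float()  # [b, 1, t]

y = enc(x, x_mask)
print(y.shape)                                     # torch.Size([2, 192, 100])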
dreamvoice/train_utils/src/openvoice/commons.py ADDED
@@ -0,0 +1,160 @@
1
+ import math
2
+ import torch
3
+ from torch.nn import functional as F
4
+
5
+
6
+ def init_weights(m, mean=0.0, std=0.01):
7
+ classname = m.__class__.__name__
8
+ if classname.find("Conv") != -1:
9
+ m.weight.data.normal_(mean, std)
10
+
11
+
12
+ def get_padding(kernel_size, dilation=1):
13
+ return int((kernel_size * dilation - dilation) / 2)
14
+
15
+
16
+ def convert_pad_shape(pad_shape):
17
+ layer = pad_shape[::-1]
18
+ pad_shape = [item for sublist in layer for item in sublist]
19
+ return pad_shape
20
+
21
+
22
+ def intersperse(lst, item):
23
+ result = [item] * (len(lst) * 2 + 1)
24
+ result[1::2] = lst
25
+ return result
26
+
27
+
28
+ def kl_divergence(m_p, logs_p, m_q, logs_q):
29
+ """KL(P||Q)"""
30
+ kl = (logs_q - logs_p) - 0.5
31
+ kl += (
32
+ 0.5 * (torch.exp(2.0 * logs_p) + ((m_p - m_q) ** 2)) * torch.exp(-2.0 * logs_q)
33
+ )
34
+ return kl
35
+
36
+
37
+ def rand_gumbel(shape):
38
+ """Sample from the Gumbel distribution, protect from overflows."""
39
+ uniform_samples = torch.rand(shape) * 0.99998 + 0.00001
40
+ return -torch.log(-torch.log(uniform_samples))
41
+
42
+
43
+ def rand_gumbel_like(x):
44
+ g = rand_gumbel(x.size()).to(dtype=x.dtype, device=x.device)
45
+ return g
46
+
47
+
48
+ def slice_segments(x, ids_str, segment_size=4):
49
+ ret = torch.zeros_like(x[:, :, :segment_size])
50
+ for i in range(x.size(0)):
51
+ idx_str = ids_str[i]
52
+ idx_end = idx_str + segment_size
53
+ ret[i] = x[i, :, idx_str:idx_end]
54
+ return ret
55
+
56
+
57
+ def rand_slice_segments(x, x_lengths=None, segment_size=4):
58
+ b, d, t = x.size()
59
+ if x_lengths is None:
60
+ x_lengths = t
61
+ ids_str_max = x_lengths - segment_size + 1
62
+ ids_str = (torch.rand([b]).to(device=x.device) * ids_str_max).to(dtype=torch.long)
63
+ ret = slice_segments(x, ids_str, segment_size)
64
+ return ret, ids_str
65
+
66
+
67
+ def get_timing_signal_1d(length, channels, min_timescale=1.0, max_timescale=1.0e4):
68
+ position = torch.arange(length, dtype=torch.float)
69
+ num_timescales = channels // 2
70
+ log_timescale_increment = math.log(float(max_timescale) / float(min_timescale)) / (
71
+ num_timescales - 1
72
+ )
73
+ inv_timescales = min_timescale * torch.exp(
74
+ torch.arange(num_timescales, dtype=torch.float) * -log_timescale_increment
75
+ )
76
+ scaled_time = position.unsqueeze(0) * inv_timescales.unsqueeze(1)
77
+ signal = torch.cat([torch.sin(scaled_time), torch.cos(scaled_time)], 0)
78
+ signal = F.pad(signal, [0, 0, 0, channels % 2])
79
+ signal = signal.view(1, channels, length)
80
+ return signal
81
+
82
+
83
+ def add_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4):
84
+ b, channels, length = x.size()
85
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
86
+ return x + signal.to(dtype=x.dtype, device=x.device)
87
+
88
+
89
+ def cat_timing_signal_1d(x, min_timescale=1.0, max_timescale=1.0e4, axis=1):
90
+ b, channels, length = x.size()
91
+ signal = get_timing_signal_1d(length, channels, min_timescale, max_timescale)
92
+ return torch.cat([x, signal.to(dtype=x.dtype, device=x.device)], axis)
93
+
94
+
95
+ def subsequent_mask(length):
96
+ mask = torch.tril(torch.ones(length, length)).unsqueeze(0).unsqueeze(0)
97
+ return mask
98
+
99
+
100
+ @torch.jit.script
101
+ def fused_add_tanh_sigmoid_multiply(input_a, input_b, n_channels):
102
+ n_channels_int = n_channels[0]
103
+ in_act = input_a + input_b
104
+ t_act = torch.tanh(in_act[:, :n_channels_int, :])
105
+ s_act = torch.sigmoid(in_act[:, n_channels_int:, :])
106
+ acts = t_act * s_act
107
+ return acts
108
+
109
+
110
+ def convert_pad_shape(pad_shape):  # duplicate of the definition above (kept as in the upstream source)
111
+ layer = pad_shape[::-1]
112
+ pad_shape = [item for sublist in layer for item in sublist]
113
+ return pad_shape
114
+
115
+
116
+ def shift_1d(x):
117
+ x = F.pad(x, convert_pad_shape([[0, 0], [0, 0], [1, 0]]))[:, :, :-1]
118
+ return x
119
+
120
+
121
+ def sequence_mask(length, max_length=None):
122
+ if max_length is None:
123
+ max_length = length.max()
124
+ x = torch.arange(max_length, dtype=length.dtype, device=length.device)
125
+ return x.unsqueeze(0) < length.unsqueeze(1)
126
+
127
+
128
+ def generate_path(duration, mask):
129
+ """
130
+ duration: [b, 1, t_x]
131
+ mask: [b, 1, t_y, t_x]
132
+ """
133
+
134
+ b, _, t_y, t_x = mask.shape
135
+ cum_duration = torch.cumsum(duration, -1)
136
+
137
+ cum_duration_flat = cum_duration.view(b * t_x)
138
+ path = sequence_mask(cum_duration_flat, t_y).to(mask.dtype)
139
+ path = path.view(b, t_x, t_y)
140
+ path = path - F.pad(path, convert_pad_shape([[0, 0], [1, 0], [0, 0]]))[:, :-1]
141
+ path = path.unsqueeze(1).transpose(2, 3) * mask
142
+ return path
143
+
144
+
145
+ def clip_grad_value_(parameters, clip_value, norm_type=2):
146
+ if isinstance(parameters, torch.Tensor):
147
+ parameters = [parameters]
148
+ parameters = list(filter(lambda p: p.grad is not None, parameters))
149
+ norm_type = float(norm_type)
150
+ if clip_value is not None:
151
+ clip_value = float(clip_value)
152
+
153
+ total_norm = 0
154
+ for p in parameters:
155
+ param_norm = p.grad.data.norm(norm_type)
156
+ total_norm += param_norm.item() ** norm_type
157
+ if clip_value is not None:
158
+ p.grad.data.clamp_(min=-clip_value, max=clip_value)
159
+ total_norm = total_norm ** (1.0 / norm_type)
160
+ return total_norm
dreamvoice/train_utils/src/openvoice/mel_processing.py ADDED
@@ -0,0 +1,183 @@
1
+ import torch
2
+ import torch.utils.data
3
+ from librosa.filters import mel as librosa_mel_fn
4
+
5
+ MAX_WAV_VALUE = 32768.0
6
+
7
+
8
+ def dynamic_range_compression_torch(x, C=1, clip_val=1e-5):
9
+ """
10
+ PARAMS
11
+ ------
12
+ C: compression factor
13
+ """
14
+ return torch.log(torch.clamp(x, min=clip_val) * C)
15
+
16
+
17
+ def dynamic_range_decompression_torch(x, C=1):
18
+ """
19
+ PARAMS
20
+ ------
21
+ C: compression factor used to compress
22
+ """
23
+ return torch.exp(x) / C
24
+
25
+
26
+ def spectral_normalize_torch(magnitudes):
27
+ output = dynamic_range_compression_torch(magnitudes)
28
+ return output
29
+
30
+
31
+ def spectral_de_normalize_torch(magnitudes):
32
+ output = dynamic_range_decompression_torch(magnitudes)
33
+ return output
34
+
35
+
36
+ mel_basis = {}
37
+ hann_window = {}
38
+
39
+
40
+ def spectrogram_torch(y, n_fft, sampling_rate, hop_size, win_size, center=False):
41
+ if torch.min(y) < -1.1:
42
+ print("min value is ", torch.min(y))
43
+ if torch.max(y) > 1.1:
44
+ print("max value is ", torch.max(y))
45
+
46
+ global hann_window
47
+ dtype_device = str(y.dtype) + "_" + str(y.device)
48
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
49
+ if wnsize_dtype_device not in hann_window:
50
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
51
+ dtype=y.dtype, device=y.device
52
+ )
53
+
54
+ y = torch.nn.functional.pad(
55
+ y.unsqueeze(1),
56
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
57
+ mode="reflect",
58
+ )
59
+ y = y.squeeze(1)
60
+
61
+ spec = torch.stft(
62
+ y,
63
+ n_fft,
64
+ hop_length=hop_size,
65
+ win_length=win_size,
66
+ window=hann_window[wnsize_dtype_device],
67
+ center=center,
68
+ pad_mode="reflect",
69
+ normalized=False,
70
+ onesided=True,
71
+ return_complex=False,
72
+ )
73
+
74
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
75
+ return spec
76
+
77
+
78
+ def spectrogram_torch_conv(y, n_fft, sampling_rate, hop_size, win_size, center=False):
79
+ # if torch.min(y) < -1.:
80
+ # print('min value is ', torch.min(y))
81
+ # if torch.max(y) > 1.:
82
+ # print('max value is ', torch.max(y))
83
+
84
+ global hann_window
85
+ dtype_device = str(y.dtype) + '_' + str(y.device)
86
+ wnsize_dtype_device = str(win_size) + '_' + dtype_device
87
+ if wnsize_dtype_device not in hann_window:
88
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(dtype=y.dtype, device=y.device)
89
+
90
+ y = torch.nn.functional.pad(y.unsqueeze(1), (int((n_fft-hop_size)/2), int((n_fft-hop_size)/2)), mode='reflect')
91
+
92
+ # ******************** original ************************#
93
+ # y = y.squeeze(1)
94
+ # spec1 = torch.stft(y, n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
95
+ # center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
96
+
97
+ # ******************** ConvSTFT ************************#
98
+ freq_cutoff = n_fft // 2 + 1
99
+ fourier_basis = torch.view_as_real(torch.fft.fft(torch.eye(n_fft)))
100
+ forward_basis = fourier_basis[:freq_cutoff].permute(2, 0, 1).reshape(-1, 1, fourier_basis.shape[1])
101
+ import librosa  # local import: only librosa.filters.mel is imported at module level, but pad_center is needed here
+ forward_basis = forward_basis * torch.as_tensor(librosa.util.pad_center(torch.hann_window(win_size), size=n_fft)).float()
102
+
103
+ import torch.nn.functional as F
104
+
105
+ # if center:
106
+ # signal = F.pad(y[:, None, None, :], (n_fft // 2, n_fft // 2, 0, 0), mode = 'reflect').squeeze(1)
107
+ assert center is False
108
+
109
+ forward_transform_squared = F.conv1d(y, forward_basis.to(y.device), stride = hop_size)
110
+ spec2 = torch.stack([forward_transform_squared[:, :freq_cutoff, :], forward_transform_squared[:, freq_cutoff:, :]], dim = -1)
111
+
112
+
113
+ # ******************** Verification ************************#
114
+ spec1 = torch.stft(y.squeeze(1), n_fft, hop_length=hop_size, win_length=win_size, window=hann_window[wnsize_dtype_device],
115
+ center=center, pad_mode='reflect', normalized=False, onesided=True, return_complex=False)
116
+ assert torch.allclose(spec1, spec2, atol=1e-4)
117
+
118
+ spec = torch.sqrt(spec2.pow(2).sum(-1) + 1e-6)
119
+ return spec
120
+
121
+
122
+ def spec_to_mel_torch(spec, n_fft, num_mels, sampling_rate, fmin, fmax):
123
+ global mel_basis
124
+ dtype_device = str(spec.dtype) + "_" + str(spec.device)
125
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
126
+ if fmax_dtype_device not in mel_basis:
127
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
128
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
129
+ dtype=spec.dtype, device=spec.device
130
+ )
131
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
132
+ spec = spectral_normalize_torch(spec)
133
+ return spec
134
+
135
+
136
+ def mel_spectrogram_torch(
137
+ y, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False
138
+ ):
139
+ if torch.min(y) < -1.0:
140
+ print("min value is ", torch.min(y))
141
+ if torch.max(y) > 1.0:
142
+ print("max value is ", torch.max(y))
143
+
144
+ global mel_basis, hann_window
145
+ dtype_device = str(y.dtype) + "_" + str(y.device)
146
+ fmax_dtype_device = str(fmax) + "_" + dtype_device
147
+ wnsize_dtype_device = str(win_size) + "_" + dtype_device
148
+ if fmax_dtype_device not in mel_basis:
149
+ mel = librosa_mel_fn(sampling_rate, n_fft, num_mels, fmin, fmax)
150
+ mel_basis[fmax_dtype_device] = torch.from_numpy(mel).to(
151
+ dtype=y.dtype, device=y.device
152
+ )
153
+ if wnsize_dtype_device not in hann_window:
154
+ hann_window[wnsize_dtype_device] = torch.hann_window(win_size).to(
155
+ dtype=y.dtype, device=y.device
156
+ )
157
+
158
+ y = torch.nn.functional.pad(
159
+ y.unsqueeze(1),
160
+ (int((n_fft - hop_size) / 2), int((n_fft - hop_size) / 2)),
161
+ mode="reflect",
162
+ )
163
+ y = y.squeeze(1)
164
+
165
+ spec = torch.stft(
166
+ y,
167
+ n_fft,
168
+ hop_length=hop_size,
169
+ win_length=win_size,
170
+ window=hann_window[wnsize_dtype_device],
171
+ center=center,
172
+ pad_mode="reflect",
173
+ normalized=False,
174
+ onesided=True,
175
+ return_complex=False,
176
+ )
177
+
178
+ spec = torch.sqrt(spec.pow(2).sum(-1) + 1e-6)
179
+
180
+ spec = torch.matmul(mel_basis[fmax_dtype_device], spec)
181
+ spec = spectral_normalize_torch(spec)
182
+
183
+ return spec
dreamvoice/train_utils/src/openvoice/models.py ADDED
@@ -0,0 +1,499 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from openvoice import commons
7
+ from openvoice import modules
8
+ from openvoice import attentions
9
+
10
+ from torch.nn import Conv1d, ConvTranspose1d, Conv2d
11
+ from torch.nn.utils import weight_norm, remove_weight_norm, spectral_norm
12
+
13
+ from openvoice.commons import init_weights, get_padding
14
+
15
+
16
+ class TextEncoder(nn.Module):
17
+ def __init__(self,
18
+ n_vocab,
19
+ out_channels,
20
+ hidden_channels,
21
+ filter_channels,
22
+ n_heads,
23
+ n_layers,
24
+ kernel_size,
25
+ p_dropout):
26
+ super().__init__()
27
+ self.n_vocab = n_vocab
28
+ self.out_channels = out_channels
29
+ self.hidden_channels = hidden_channels
30
+ self.filter_channels = filter_channels
31
+ self.n_heads = n_heads
32
+ self.n_layers = n_layers
33
+ self.kernel_size = kernel_size
34
+ self.p_dropout = p_dropout
35
+
36
+ self.emb = nn.Embedding(n_vocab, hidden_channels)
37
+ nn.init.normal_(self.emb.weight, 0.0, hidden_channels**-0.5)
38
+
39
+ self.encoder = attentions.Encoder(
40
+ hidden_channels,
41
+ filter_channels,
42
+ n_heads,
43
+ n_layers,
44
+ kernel_size,
45
+ p_dropout)
46
+ self.proj= nn.Conv1d(hidden_channels, out_channels * 2, 1)
47
+
48
+ def forward(self, x, x_lengths):
49
+ x = self.emb(x) * math.sqrt(self.hidden_channels) # [b, t, h]
50
+ x = torch.transpose(x, 1, -1) # [b, h, t]
51
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(x.dtype)
52
+
53
+ x = self.encoder(x * x_mask, x_mask)
54
+ stats = self.proj(x) * x_mask
55
+
56
+ m, logs = torch.split(stats, self.out_channels, dim=1)
57
+ return x, m, logs, x_mask
58
+
59
+
60
+ class DurationPredictor(nn.Module):
61
+ def __init__(
62
+ self, in_channels, filter_channels, kernel_size, p_dropout, gin_channels=0
63
+ ):
64
+ super().__init__()
65
+
66
+ self.in_channels = in_channels
67
+ self.filter_channels = filter_channels
68
+ self.kernel_size = kernel_size
69
+ self.p_dropout = p_dropout
70
+ self.gin_channels = gin_channels
71
+
72
+ self.drop = nn.Dropout(p_dropout)
73
+ self.conv_1 = nn.Conv1d(
74
+ in_channels, filter_channels, kernel_size, padding=kernel_size // 2
75
+ )
76
+ self.norm_1 = modules.LayerNorm(filter_channels)
77
+ self.conv_2 = nn.Conv1d(
78
+ filter_channels, filter_channels, kernel_size, padding=kernel_size // 2
79
+ )
80
+ self.norm_2 = modules.LayerNorm(filter_channels)
81
+ self.proj = nn.Conv1d(filter_channels, 1, 1)
82
+
83
+ if gin_channels != 0:
84
+ self.cond = nn.Conv1d(gin_channels, in_channels, 1)
85
+
86
+ def forward(self, x, x_mask, g=None):
87
+ x = torch.detach(x)
88
+ if g is not None:
89
+ g = torch.detach(g)
90
+ x = x + self.cond(g)
91
+ x = self.conv_1(x * x_mask)
92
+ x = torch.relu(x)
93
+ x = self.norm_1(x)
94
+ x = self.drop(x)
95
+ x = self.conv_2(x * x_mask)
96
+ x = torch.relu(x)
97
+ x = self.norm_2(x)
98
+ x = self.drop(x)
99
+ x = self.proj(x * x_mask)
100
+ return x * x_mask
101
+
102
+ class StochasticDurationPredictor(nn.Module):
103
+ def __init__(self, in_channels, filter_channels, kernel_size, p_dropout, n_flows=4, gin_channels=0):
104
+ super().__init__()
105
+ filter_channels = in_channels  # this should be removed in a future version
106
+ self.in_channels = in_channels
107
+ self.filter_channels = filter_channels
108
+ self.kernel_size = kernel_size
109
+ self.p_dropout = p_dropout
110
+ self.n_flows = n_flows
111
+ self.gin_channels = gin_channels
112
+
113
+ self.log_flow = modules.Log()
114
+ self.flows = nn.ModuleList()
115
+ self.flows.append(modules.ElementwiseAffine(2))
116
+ for i in range(n_flows):
117
+ self.flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
118
+ self.flows.append(modules.Flip())
119
+
120
+ self.post_pre = nn.Conv1d(1, filter_channels, 1)
121
+ self.post_proj = nn.Conv1d(filter_channels, filter_channels, 1)
122
+ self.post_convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
123
+ self.post_flows = nn.ModuleList()
124
+ self.post_flows.append(modules.ElementwiseAffine(2))
125
+ for i in range(4):
126
+ self.post_flows.append(modules.ConvFlow(2, filter_channels, kernel_size, n_layers=3))
127
+ self.post_flows.append(modules.Flip())
128
+
129
+ self.pre = nn.Conv1d(in_channels, filter_channels, 1)
130
+ self.proj = nn.Conv1d(filter_channels, filter_channels, 1)
131
+ self.convs = modules.DDSConv(filter_channels, kernel_size, n_layers=3, p_dropout=p_dropout)
132
+ if gin_channels != 0:
133
+ self.cond = nn.Conv1d(gin_channels, filter_channels, 1)
134
+
135
+ def forward(self, x, x_mask, w=None, g=None, reverse=False, noise_scale=1.0):
136
+ x = torch.detach(x)
137
+ x = self.pre(x)
138
+ if g is not None:
139
+ g = torch.detach(g)
140
+ x = x + self.cond(g)
141
+ x = self.convs(x, x_mask)
142
+ x = self.proj(x) * x_mask
143
+
144
+ if not reverse:
145
+ flows = self.flows
146
+ assert w is not None
147
+
148
+ logdet_tot_q = 0
149
+ h_w = self.post_pre(w)
150
+ h_w = self.post_convs(h_w, x_mask)
151
+ h_w = self.post_proj(h_w) * x_mask
152
+ e_q = torch.randn(w.size(0), 2, w.size(2)).to(device=x.device, dtype=x.dtype) * x_mask
153
+ z_q = e_q
154
+ for flow in self.post_flows:
155
+ z_q, logdet_q = flow(z_q, x_mask, g=(x + h_w))
156
+ logdet_tot_q += logdet_q
157
+ z_u, z1 = torch.split(z_q, [1, 1], 1)
158
+ u = torch.sigmoid(z_u) * x_mask
159
+ z0 = (w - u) * x_mask
160
+ logdet_tot_q += torch.sum((F.logsigmoid(z_u) + F.logsigmoid(-z_u)) * x_mask, [1,2])
161
+ logq = torch.sum(-0.5 * (math.log(2*math.pi) + (e_q**2)) * x_mask, [1,2]) - logdet_tot_q
162
+
163
+ logdet_tot = 0
164
+ z0, logdet = self.log_flow(z0, x_mask)
165
+ logdet_tot += logdet
166
+ z = torch.cat([z0, z1], 1)
167
+ for flow in flows:
168
+ z, logdet = flow(z, x_mask, g=x, reverse=reverse)
169
+ logdet_tot = logdet_tot + logdet
170
+ nll = torch.sum(0.5 * (math.log(2*math.pi) + (z**2)) * x_mask, [1,2]) - logdet_tot
171
+ return nll + logq # [b]
172
+ else:
173
+ flows = list(reversed(self.flows))
174
+ flows = flows[:-2] + [flows[-1]] # remove a useless vflow
175
+ z = torch.randn(x.size(0), 2, x.size(2)).to(device=x.device, dtype=x.dtype) * noise_scale
176
+ for flow in flows:
177
+ z = flow(z, x_mask, g=x, reverse=reverse)
178
+ z0, z1 = torch.split(z, [1, 1], 1)
179
+ logw = z0
180
+ return logw
181
+
182
+ class PosteriorEncoder(nn.Module):
183
+ def __init__(
184
+ self,
185
+ in_channels,
186
+ out_channels,
187
+ hidden_channels,
188
+ kernel_size,
189
+ dilation_rate,
190
+ n_layers,
191
+ gin_channels=0,
192
+ ):
193
+ super().__init__()
194
+ self.in_channels = in_channels
195
+ self.out_channels = out_channels
196
+ self.hidden_channels = hidden_channels
197
+ self.kernel_size = kernel_size
198
+ self.dilation_rate = dilation_rate
199
+ self.n_layers = n_layers
200
+ self.gin_channels = gin_channels
201
+
202
+ self.pre = nn.Conv1d(in_channels, hidden_channels, 1)
203
+ self.enc = modules.WN(
204
+ hidden_channels,
205
+ kernel_size,
206
+ dilation_rate,
207
+ n_layers,
208
+ gin_channels=gin_channels,
209
+ )
210
+ self.proj = nn.Conv1d(hidden_channels, out_channels * 2, 1)
211
+
212
+ def forward(self, x, x_lengths, g=None, tau=1.0):
213
+ x_mask = torch.unsqueeze(commons.sequence_mask(x_lengths, x.size(2)), 1).to(
214
+ x.dtype
215
+ )
216
+ x = self.pre(x) * x_mask
217
+ x = self.enc(x, x_mask, g=g)
218
+ stats = self.proj(x) * x_mask
219
+ m, logs = torch.split(stats, self.out_channels, dim=1)
220
+ z = (m + torch.randn_like(m) * tau * torch.exp(logs)) * x_mask
221
+ return z, m, logs, x_mask
222
+
223
+
224
+ class Generator(torch.nn.Module):
225
+ def __init__(
226
+ self,
227
+ initial_channel,
228
+ resblock,
229
+ resblock_kernel_sizes,
230
+ resblock_dilation_sizes,
231
+ upsample_rates,
232
+ upsample_initial_channel,
233
+ upsample_kernel_sizes,
234
+ gin_channels=0,
235
+ ):
236
+ super(Generator, self).__init__()
237
+ self.num_kernels = len(resblock_kernel_sizes)
238
+ self.num_upsamples = len(upsample_rates)
239
+ self.conv_pre = Conv1d(
240
+ initial_channel, upsample_initial_channel, 7, 1, padding=3
241
+ )
242
+ resblock = modules.ResBlock1 if resblock == "1" else modules.ResBlock2
243
+
244
+ self.ups = nn.ModuleList()
245
+ for i, (u, k) in enumerate(zip(upsample_rates, upsample_kernel_sizes)):
246
+ self.ups.append(
247
+ weight_norm(
248
+ ConvTranspose1d(
249
+ upsample_initial_channel // (2**i),
250
+ upsample_initial_channel // (2 ** (i + 1)),
251
+ k,
252
+ u,
253
+ padding=(k - u) // 2,
254
+ )
255
+ )
256
+ )
257
+
258
+ self.resblocks = nn.ModuleList()
259
+ for i in range(len(self.ups)):
260
+ ch = upsample_initial_channel // (2 ** (i + 1))
261
+ for j, (k, d) in enumerate(
262
+ zip(resblock_kernel_sizes, resblock_dilation_sizes)
263
+ ):
264
+ self.resblocks.append(resblock(ch, k, d))
265
+
266
+ self.conv_post = Conv1d(ch, 1, 7, 1, padding=3, bias=False)
267
+ self.ups.apply(init_weights)
268
+
269
+ if gin_channels != 0:
270
+ self.cond = nn.Conv1d(gin_channels, upsample_initial_channel, 1)
271
+
272
+ def forward(self, x, g=None):
273
+ x = self.conv_pre(x)
274
+ if g is not None:
275
+ x = x + self.cond(g)
276
+
277
+ for i in range(self.num_upsamples):
278
+ x = F.leaky_relu(x, modules.LRELU_SLOPE)
279
+ x = self.ups[i](x)
280
+ xs = None
281
+ for j in range(self.num_kernels):
282
+ if xs is None:
283
+ xs = self.resblocks[i * self.num_kernels + j](x)
284
+ else:
285
+ xs += self.resblocks[i * self.num_kernels + j](x)
286
+ x = xs / self.num_kernels
287
+ x = F.leaky_relu(x)
288
+ x = self.conv_post(x)
289
+ x = torch.tanh(x)
290
+
291
+ return x
292
+
293
+ def remove_weight_norm(self):
294
+ print("Removing weight norm...")
295
+ for layer in self.ups:
296
+ remove_weight_norm(layer)
297
+ for layer in self.resblocks:
298
+ layer.remove_weight_norm()
299
+
300
+
301
+ class ReferenceEncoder(nn.Module):
302
+ """
303
+ inputs --- [N, Ty/r, n_mels*r] mels
304
+ outputs --- [N, ref_enc_gru_size]
305
+ """
306
+
307
+ def __init__(self, spec_channels, gin_channels=0, layernorm=True):
308
+ super().__init__()
309
+ self.spec_channels = spec_channels
310
+ ref_enc_filters = [32, 32, 64, 64, 128, 128]
311
+ K = len(ref_enc_filters)
312
+ filters = [1] + ref_enc_filters
313
+ convs = [
314
+ weight_norm(
315
+ nn.Conv2d(
316
+ in_channels=filters[i],
317
+ out_channels=filters[i + 1],
318
+ kernel_size=(3, 3),
319
+ stride=(2, 2),
320
+ padding=(1, 1),
321
+ )
322
+ )
323
+ for i in range(K)
324
+ ]
325
+ self.convs = nn.ModuleList(convs)
326
+
327
+ out_channels = self.calculate_channels(spec_channels, 3, 2, 1, K)
328
+ self.gru = nn.GRU(
329
+ input_size=ref_enc_filters[-1] * out_channels,
330
+ hidden_size=256 // 2,
331
+ batch_first=True,
332
+ )
333
+ self.proj = nn.Linear(128, gin_channels)
334
+ if layernorm:
335
+ self.layernorm = nn.LayerNorm(self.spec_channels)
336
+ else:
337
+ self.layernorm = None
338
+
339
+ def forward(self, inputs, mask=None):
340
+ N = inputs.size(0)
341
+
342
+ out = inputs.view(N, 1, -1, self.spec_channels) # [N, 1, Ty, n_freqs]
343
+ if self.layernorm is not None:
344
+ out = self.layernorm(out)
345
+
346
+ for conv in self.convs:
347
+ out = conv(out)
348
+ # out = wn(out)
349
+ out = F.relu(out) # [N, 128, Ty//2^K, n_mels//2^K]
350
+
351
+ out = out.transpose(1, 2) # [N, Ty//2^K, 128, n_mels//2^K]
352
+ T = out.size(1)
353
+ N = out.size(0)
354
+ out = out.contiguous().view(N, T, -1) # [N, Ty//2^K, 128*n_mels//2^K]
355
+
356
+ self.gru.flatten_parameters()
357
+ memory, out = self.gru(out) # out --- [1, N, 128]
358
+
359
+ return self.proj(out.squeeze(0))
360
+
361
+ def calculate_channels(self, L, kernel_size, stride, pad, n_convs):
362
+ for i in range(n_convs):
363
+ L = (L - kernel_size + 2 * pad) // stride + 1
364
+ return L
365
+
366
+
367
+ class ResidualCouplingBlock(nn.Module):
368
+ def __init__(self,
369
+ channels,
370
+ hidden_channels,
371
+ kernel_size,
372
+ dilation_rate,
373
+ n_layers,
374
+ n_flows=4,
375
+ gin_channels=0):
376
+ super().__init__()
377
+ self.channels = channels
378
+ self.hidden_channels = hidden_channels
379
+ self.kernel_size = kernel_size
380
+ self.dilation_rate = dilation_rate
381
+ self.n_layers = n_layers
382
+ self.n_flows = n_flows
383
+ self.gin_channels = gin_channels
384
+
385
+ self.flows = nn.ModuleList()
386
+ for i in range(n_flows):
387
+ self.flows.append(modules.ResidualCouplingLayer(channels, hidden_channels, kernel_size, dilation_rate, n_layers, gin_channels=gin_channels, mean_only=True))
388
+ self.flows.append(modules.Flip())
389
+
390
+ def forward(self, x, x_mask, g=None, reverse=False):
391
+ if not reverse:
392
+ for flow in self.flows:
393
+ x, _ = flow(x, x_mask, g=g, reverse=reverse)
394
+ else:
395
+ for flow in reversed(self.flows):
396
+ x = flow(x, x_mask, g=g, reverse=reverse)
397
+ return x
398
+
399
+ class SynthesizerTrn(nn.Module):
400
+ """
401
+ Synthesizer for Training
402
+ """
403
+
404
+ def __init__(
405
+ self,
406
+ n_vocab,
407
+ spec_channels,
408
+ inter_channels,
409
+ hidden_channels,
410
+ filter_channels,
411
+ n_heads,
412
+ n_layers,
413
+ kernel_size,
414
+ p_dropout,
415
+ resblock,
416
+ resblock_kernel_sizes,
417
+ resblock_dilation_sizes,
418
+ upsample_rates,
419
+ upsample_initial_channel,
420
+ upsample_kernel_sizes,
421
+ n_speakers=256,
422
+ gin_channels=256,
423
+ zero_g=False,
424
+ **kwargs
425
+ ):
426
+ super().__init__()
427
+
428
+ self.dec = Generator(
429
+ inter_channels,
430
+ resblock,
431
+ resblock_kernel_sizes,
432
+ resblock_dilation_sizes,
433
+ upsample_rates,
434
+ upsample_initial_channel,
435
+ upsample_kernel_sizes,
436
+ gin_channels=gin_channels,
437
+ )
438
+ self.enc_q = PosteriorEncoder(
439
+ spec_channels,
440
+ inter_channels,
441
+ hidden_channels,
442
+ 5,
443
+ 1,
444
+ 16,
445
+ gin_channels=gin_channels,
446
+ )
447
+
448
+ self.flow = ResidualCouplingBlock(inter_channels, hidden_channels, 5, 1, 4, gin_channels=gin_channels)
449
+
450
+ self.n_speakers = n_speakers
451
+ if n_speakers == 0:
452
+ self.ref_enc = ReferenceEncoder(spec_channels, gin_channels)
453
+ else:
454
+ self.enc_p = TextEncoder(n_vocab,
455
+ inter_channels,
456
+ hidden_channels,
457
+ filter_channels,
458
+ n_heads,
459
+ n_layers,
460
+ kernel_size,
461
+ p_dropout)
462
+ self.sdp = StochasticDurationPredictor(hidden_channels, 192, 3, 0.5, 4, gin_channels=gin_channels)
463
+ self.dp = DurationPredictor(hidden_channels, 256, 3, 0.5, gin_channels=gin_channels)
464
+ self.emb_g = nn.Embedding(n_speakers, gin_channels)
465
+ self.zero_g = zero_g
466
+
467
+ def infer(self, x, x_lengths, sid=None, noise_scale=1, length_scale=1, noise_scale_w=1., sdp_ratio=0.2, max_len=None):
468
+ x, m_p, logs_p, x_mask = self.enc_p(x, x_lengths)
469
+ if self.n_speakers > 0:
470
+ g = self.emb_g(sid).unsqueeze(-1) # [b, h, 1]
471
+ else:
472
+ g = None
473
+
474
+ logw = self.sdp(x, x_mask, g=g, reverse=True, noise_scale=noise_scale_w) * sdp_ratio \
475
+ + self.dp(x, x_mask, g=g) * (1 - sdp_ratio)
476
+
477
+ w = torch.exp(logw) * x_mask * length_scale
478
+ w_ceil = torch.ceil(w)
479
+ y_lengths = torch.clamp_min(torch.sum(w_ceil, [1, 2]), 1).long()
480
+ y_mask = torch.unsqueeze(commons.sequence_mask(y_lengths, None), 1).to(x_mask.dtype)
481
+ attn_mask = torch.unsqueeze(x_mask, 2) * torch.unsqueeze(y_mask, -1)
482
+ attn = commons.generate_path(w_ceil, attn_mask)
483
+
484
+ m_p = torch.matmul(attn.squeeze(1), m_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
485
+ logs_p = torch.matmul(attn.squeeze(1), logs_p.transpose(1, 2)).transpose(1, 2) # [b, t', t], [b, t, d] -> [b, d, t']
486
+
487
+ z_p = m_p + torch.randn_like(m_p) * torch.exp(logs_p) * noise_scale
488
+ z = self.flow(z_p, y_mask, g=g, reverse=True)
489
+ o = self.dec((z * y_mask)[:,:,:max_len], g=g)
490
+ return o, attn, y_mask, (z, z_p, m_p, logs_p)
491
+
492
+ def voice_conversion(self, y, y_lengths, sid_src, sid_tgt, tau=1.0):
493
+ g_src = sid_src
494
+ g_tgt = sid_tgt
495
+ z, m_q, logs_q, y_mask = self.enc_q(y, y_lengths, g=g_src if not self.zero_g else torch.zeros_like(g_src), tau=tau)
496
+ z_p = self.flow(z, y_mask, g=g_src)
497
+ z_hat = self.flow(z_p, y_mask, g=g_tgt, reverse=True)
498
+ o_hat = self.dec(z_hat * y_mask, g=g_tgt if not self.zero_g else torch.zeros_like(g_tgt))
499
+ return o_hat, y_mask, (z, z_p, z_hat)
dreamvoice/train_utils/src/openvoice/modules.py ADDED
@@ -0,0 +1,598 @@
1
+ import math
2
+ import torch
3
+ from torch import nn
4
+ from torch.nn import functional as F
5
+
6
+ from torch.nn import Conv1d
7
+ from torch.nn.utils import weight_norm, remove_weight_norm
8
+
9
+ from openvoice import commons
10
+ from openvoice.commons import init_weights, get_padding
11
+ from openvoice.transforms import piecewise_rational_quadratic_transform
12
+ from openvoice.attentions import Encoder
13
+
14
+ LRELU_SLOPE = 0.1
15
+
16
+
17
+ class LayerNorm(nn.Module):
18
+ def __init__(self, channels, eps=1e-5):
19
+ super().__init__()
20
+ self.channels = channels
21
+ self.eps = eps
22
+
23
+ self.gamma = nn.Parameter(torch.ones(channels))
24
+ self.beta = nn.Parameter(torch.zeros(channels))
25
+
26
+ def forward(self, x):
27
+ x = x.transpose(1, -1)
28
+ x = F.layer_norm(x, (self.channels,), self.gamma, self.beta, self.eps)
29
+ return x.transpose(1, -1)
30
+
31
+
32
+ class ConvReluNorm(nn.Module):
33
+ def __init__(
34
+ self,
35
+ in_channels,
36
+ hidden_channels,
37
+ out_channels,
38
+ kernel_size,
39
+ n_layers,
40
+ p_dropout,
41
+ ):
42
+ super().__init__()
43
+ self.in_channels = in_channels
44
+ self.hidden_channels = hidden_channels
45
+ self.out_channels = out_channels
46
+ self.kernel_size = kernel_size
47
+ self.n_layers = n_layers
48
+ self.p_dropout = p_dropout
49
+ assert n_layers > 1, "Number of layers should be larger than 1."
50
+
51
+ self.conv_layers = nn.ModuleList()
52
+ self.norm_layers = nn.ModuleList()
53
+ self.conv_layers.append(
54
+ nn.Conv1d(
55
+ in_channels, hidden_channels, kernel_size, padding=kernel_size // 2
56
+ )
57
+ )
58
+ self.norm_layers.append(LayerNorm(hidden_channels))
59
+ self.relu_drop = nn.Sequential(nn.ReLU(), nn.Dropout(p_dropout))
60
+ for _ in range(n_layers - 1):
61
+ self.conv_layers.append(
62
+ nn.Conv1d(
63
+ hidden_channels,
64
+ hidden_channels,
65
+ kernel_size,
66
+ padding=kernel_size // 2,
67
+ )
68
+ )
69
+ self.norm_layers.append(LayerNorm(hidden_channels))
70
+ self.proj = nn.Conv1d(hidden_channels, out_channels, 1)
71
+ self.proj.weight.data.zero_()
72
+ self.proj.bias.data.zero_()
73
+
74
+ def forward(self, x, x_mask):
75
+ x_org = x
76
+ for i in range(self.n_layers):
77
+ x = self.conv_layers[i](x * x_mask)
78
+ x = self.norm_layers[i](x)
79
+ x = self.relu_drop(x)
80
+ x = x_org + self.proj(x)
81
+ return x * x_mask
82
+
83
+
84
+ class DDSConv(nn.Module):
85
+ """
86
+ Dilated and Depth-Separable Convolution
87
+ """
88
+
89
+ def __init__(self, channels, kernel_size, n_layers, p_dropout=0.0):
90
+ super().__init__()
91
+ self.channels = channels
92
+ self.kernel_size = kernel_size
93
+ self.n_layers = n_layers
94
+ self.p_dropout = p_dropout
95
+
96
+ self.drop = nn.Dropout(p_dropout)
97
+ self.convs_sep = nn.ModuleList()
98
+ self.convs_1x1 = nn.ModuleList()
99
+ self.norms_1 = nn.ModuleList()
100
+ self.norms_2 = nn.ModuleList()
101
+ for i in range(n_layers):
102
+ dilation = kernel_size**i
103
+ padding = (kernel_size * dilation - dilation) // 2
104
+ self.convs_sep.append(
105
+ nn.Conv1d(
106
+ channels,
107
+ channels,
108
+ kernel_size,
109
+ groups=channels,
110
+ dilation=dilation,
111
+ padding=padding,
112
+ )
113
+ )
114
+ self.convs_1x1.append(nn.Conv1d(channels, channels, 1))
115
+ self.norms_1.append(LayerNorm(channels))
116
+ self.norms_2.append(LayerNorm(channels))
117
+
118
+ def forward(self, x, x_mask, g=None):
119
+ if g is not None:
120
+ x = x + g
121
+ for i in range(self.n_layers):
122
+ y = self.convs_sep[i](x * x_mask)
123
+ y = self.norms_1[i](y)
124
+ y = F.gelu(y)
125
+ y = self.convs_1x1[i](y)
126
+ y = self.norms_2[i](y)
127
+ y = F.gelu(y)
128
+ y = self.drop(y)
129
+ x = x + y
130
+ return x * x_mask
131
+
132
+
133
+ class WN(torch.nn.Module):
134
+ def __init__(
135
+ self,
136
+ hidden_channels,
137
+ kernel_size,
138
+ dilation_rate,
139
+ n_layers,
140
+ gin_channels=0,
141
+ p_dropout=0,
142
+ ):
143
+ super(WN, self).__init__()
144
+ assert kernel_size % 2 == 1
145
+ self.hidden_channels = hidden_channels
146
+ self.kernel_size = (kernel_size,)
147
+ self.dilation_rate = dilation_rate
148
+ self.n_layers = n_layers
149
+ self.gin_channels = gin_channels
150
+ self.p_dropout = p_dropout
151
+
152
+ self.in_layers = torch.nn.ModuleList()
153
+ self.res_skip_layers = torch.nn.ModuleList()
154
+ self.drop = nn.Dropout(p_dropout)
155
+
156
+ if gin_channels != 0:
157
+ cond_layer = torch.nn.Conv1d(
158
+ gin_channels, 2 * hidden_channels * n_layers, 1
159
+ )
160
+ self.cond_layer = torch.nn.utils.weight_norm(cond_layer, name="weight")
161
+
162
+ for i in range(n_layers):
163
+ dilation = dilation_rate**i
164
+ padding = int((kernel_size * dilation - dilation) / 2)
165
+ in_layer = torch.nn.Conv1d(
166
+ hidden_channels,
167
+ 2 * hidden_channels,
168
+ kernel_size,
169
+ dilation=dilation,
170
+ padding=padding,
171
+ )
172
+ in_layer = torch.nn.utils.weight_norm(in_layer, name="weight")
173
+ self.in_layers.append(in_layer)
174
+
175
+ # last one is not necessary
176
+ if i < n_layers - 1:
177
+ res_skip_channels = 2 * hidden_channels
178
+ else:
179
+ res_skip_channels = hidden_channels
180
+
181
+ res_skip_layer = torch.nn.Conv1d(hidden_channels, res_skip_channels, 1)
182
+ res_skip_layer = torch.nn.utils.weight_norm(res_skip_layer, name="weight")
183
+ self.res_skip_layers.append(res_skip_layer)
184
+
185
+ def forward(self, x, x_mask, g=None, **kwargs):
186
+ output = torch.zeros_like(x)
187
+ n_channels_tensor = torch.IntTensor([self.hidden_channels])
188
+
189
+ if g is not None:
190
+ g = self.cond_layer(g)
191
+
192
+ for i in range(self.n_layers):
193
+ x_in = self.in_layers[i](x)
194
+ if g is not None:
195
+ cond_offset = i * 2 * self.hidden_channels
196
+ g_l = g[:, cond_offset : cond_offset + 2 * self.hidden_channels, :]
197
+ else:
198
+ g_l = torch.zeros_like(x_in)
199
+
200
+ acts = commons.fused_add_tanh_sigmoid_multiply(x_in, g_l, n_channels_tensor)
201
+ acts = self.drop(acts)
202
+
203
+ res_skip_acts = self.res_skip_layers[i](acts)
204
+ if i < self.n_layers - 1:
205
+ res_acts = res_skip_acts[:, : self.hidden_channels, :]
206
+ x = (x + res_acts) * x_mask
207
+ output = output + res_skip_acts[:, self.hidden_channels :, :]
208
+ else:
209
+ output = output + res_skip_acts
210
+ return output * x_mask
211
+
212
+ def remove_weight_norm(self):
213
+ if self.gin_channels != 0:
214
+ torch.nn.utils.remove_weight_norm(self.cond_layer)
215
+ for l in self.in_layers:
216
+ torch.nn.utils.remove_weight_norm(l)
217
+ for l in self.res_skip_layers:
218
+ torch.nn.utils.remove_weight_norm(l)
219
+
220
+
221
+ class ResBlock1(torch.nn.Module):
222
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5)):
223
+ super(ResBlock1, self).__init__()
224
+ self.convs1 = nn.ModuleList(
225
+ [
226
+ weight_norm(
227
+ Conv1d(
228
+ channels,
229
+ channels,
230
+ kernel_size,
231
+ 1,
232
+ dilation=dilation[0],
233
+ padding=get_padding(kernel_size, dilation[0]),
234
+ )
235
+ ),
236
+ weight_norm(
237
+ Conv1d(
238
+ channels,
239
+ channels,
240
+ kernel_size,
241
+ 1,
242
+ dilation=dilation[1],
243
+ padding=get_padding(kernel_size, dilation[1]),
244
+ )
245
+ ),
246
+ weight_norm(
247
+ Conv1d(
248
+ channels,
249
+ channels,
250
+ kernel_size,
251
+ 1,
252
+ dilation=dilation[2],
253
+ padding=get_padding(kernel_size, dilation[2]),
254
+ )
255
+ ),
256
+ ]
257
+ )
258
+ self.convs1.apply(init_weights)
259
+
260
+ self.convs2 = nn.ModuleList(
261
+ [
262
+ weight_norm(
263
+ Conv1d(
264
+ channels,
265
+ channels,
266
+ kernel_size,
267
+ 1,
268
+ dilation=1,
269
+ padding=get_padding(kernel_size, 1),
270
+ )
271
+ ),
272
+ weight_norm(
273
+ Conv1d(
274
+ channels,
275
+ channels,
276
+ kernel_size,
277
+ 1,
278
+ dilation=1,
279
+ padding=get_padding(kernel_size, 1),
280
+ )
281
+ ),
282
+ weight_norm(
283
+ Conv1d(
284
+ channels,
285
+ channels,
286
+ kernel_size,
287
+ 1,
288
+ dilation=1,
289
+ padding=get_padding(kernel_size, 1),
290
+ )
291
+ ),
292
+ ]
293
+ )
294
+ self.convs2.apply(init_weights)
295
+
296
+ def forward(self, x, x_mask=None):
297
+ for c1, c2 in zip(self.convs1, self.convs2):
298
+ xt = F.leaky_relu(x, LRELU_SLOPE)
299
+ if x_mask is not None:
300
+ xt = xt * x_mask
301
+ xt = c1(xt)
302
+ xt = F.leaky_relu(xt, LRELU_SLOPE)
303
+ if x_mask is not None:
304
+ xt = xt * x_mask
305
+ xt = c2(xt)
306
+ x = xt + x
307
+ if x_mask is not None:
308
+ x = x * x_mask
309
+ return x
310
+
311
+ def remove_weight_norm(self):
312
+ for l in self.convs1:
313
+ remove_weight_norm(l)
314
+ for l in self.convs2:
315
+ remove_weight_norm(l)
316
+
317
+
318
+ class ResBlock2(torch.nn.Module):
319
+ def __init__(self, channels, kernel_size=3, dilation=(1, 3)):
320
+ super(ResBlock2, self).__init__()
321
+ self.convs = nn.ModuleList(
322
+ [
323
+ weight_norm(
324
+ Conv1d(
325
+ channels,
326
+ channels,
327
+ kernel_size,
328
+ 1,
329
+ dilation=dilation[0],
330
+ padding=get_padding(kernel_size, dilation[0]),
331
+ )
332
+ ),
333
+ weight_norm(
334
+ Conv1d(
335
+ channels,
336
+ channels,
337
+ kernel_size,
338
+ 1,
339
+ dilation=dilation[1],
340
+ padding=get_padding(kernel_size, dilation[1]),
341
+ )
342
+ ),
343
+ ]
344
+ )
345
+ self.convs.apply(init_weights)
346
+
347
+ def forward(self, x, x_mask=None):
348
+ for c in self.convs:
349
+ xt = F.leaky_relu(x, LRELU_SLOPE)
350
+ if x_mask is not None:
351
+ xt = xt * x_mask
352
+ xt = c(xt)
353
+ x = xt + x
354
+ if x_mask is not None:
355
+ x = x * x_mask
356
+ return x
357
+
358
+ def remove_weight_norm(self):
359
+ for l in self.convs:
360
+ remove_weight_norm(l)
361
+
362
+
363
+ class Log(nn.Module):
364
+ def forward(self, x, x_mask, reverse=False, **kwargs):
365
+ if not reverse:
366
+ y = torch.log(torch.clamp_min(x, 1e-5)) * x_mask
367
+ logdet = torch.sum(-y, [1, 2])
368
+ return y, logdet
369
+ else:
370
+ x = torch.exp(x) * x_mask
371
+ return x
372
+
373
+
374
+ class Flip(nn.Module):
375
+ def forward(self, x, *args, reverse=False, **kwargs):
376
+ x = torch.flip(x, [1])
377
+ if not reverse:
378
+ logdet = torch.zeros(x.size(0)).to(dtype=x.dtype, device=x.device)
379
+ return x, logdet
380
+ else:
381
+ return x
382
+
383
+
384
+ class ElementwiseAffine(nn.Module):
385
+ def __init__(self, channels):
386
+ super().__init__()
387
+ self.channels = channels
388
+ self.m = nn.Parameter(torch.zeros(channels, 1))
389
+ self.logs = nn.Parameter(torch.zeros(channels, 1))
390
+
391
+ def forward(self, x, x_mask, reverse=False, **kwargs):
392
+ if not reverse:
393
+ y = self.m + torch.exp(self.logs) * x
394
+ y = y * x_mask
395
+ logdet = torch.sum(self.logs * x_mask, [1, 2])
396
+ return y, logdet
397
+ else:
398
+ x = (x - self.m) * torch.exp(-self.logs) * x_mask
399
+ return x
400
+
401
+
402
+ class ResidualCouplingLayer(nn.Module):
403
+ def __init__(
404
+ self,
405
+ channels,
406
+ hidden_channels,
407
+ kernel_size,
408
+ dilation_rate,
409
+ n_layers,
410
+ p_dropout=0,
411
+ gin_channels=0,
412
+ mean_only=False,
413
+ ):
414
+ assert channels % 2 == 0, "channels should be divisible by 2"
415
+ super().__init__()
416
+ self.channels = channels
417
+ self.hidden_channels = hidden_channels
418
+ self.kernel_size = kernel_size
419
+ self.dilation_rate = dilation_rate
420
+ self.n_layers = n_layers
421
+ self.half_channels = channels // 2
422
+ self.mean_only = mean_only
423
+
424
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
425
+ self.enc = WN(
426
+ hidden_channels,
427
+ kernel_size,
428
+ dilation_rate,
429
+ n_layers,
430
+ p_dropout=p_dropout,
431
+ gin_channels=gin_channels,
432
+ )
433
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
434
+ self.post.weight.data.zero_()
435
+ self.post.bias.data.zero_()
436
+
437
+ def forward(self, x, x_mask, g=None, reverse=False):
438
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
439
+ h = self.pre(x0) * x_mask
440
+ h = self.enc(h, x_mask, g=g)
441
+ stats = self.post(h) * x_mask
442
+ if not self.mean_only:
443
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
444
+ else:
445
+ m = stats
446
+ logs = torch.zeros_like(m)
447
+
448
+ if not reverse:
449
+ x1 = m + x1 * torch.exp(logs) * x_mask
450
+ x = torch.cat([x0, x1], 1)
451
+ logdet = torch.sum(logs, [1, 2])
452
+ return x, logdet
453
+ else:
454
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
455
+ x = torch.cat([x0, x1], 1)
456
+ return x
457
+
458
+
459
+ class ConvFlow(nn.Module):
460
+ def __init__(
461
+ self,
462
+ in_channels,
463
+ filter_channels,
464
+ kernel_size,
465
+ n_layers,
466
+ num_bins=10,
467
+ tail_bound=5.0,
468
+ ):
469
+ super().__init__()
470
+ self.in_channels = in_channels
471
+ self.filter_channels = filter_channels
472
+ self.kernel_size = kernel_size
473
+ self.n_layers = n_layers
474
+ self.num_bins = num_bins
475
+ self.tail_bound = tail_bound
476
+ self.half_channels = in_channels // 2
477
+
478
+ self.pre = nn.Conv1d(self.half_channels, filter_channels, 1)
479
+ self.convs = DDSConv(filter_channels, kernel_size, n_layers, p_dropout=0.0)
480
+ self.proj = nn.Conv1d(
481
+ filter_channels, self.half_channels * (num_bins * 3 - 1), 1
482
+ )
483
+ self.proj.weight.data.zero_()
484
+ self.proj.bias.data.zero_()
485
+
486
+ def forward(self, x, x_mask, g=None, reverse=False):
487
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
488
+ h = self.pre(x0)
489
+ h = self.convs(h, x_mask, g=g)
490
+ h = self.proj(h) * x_mask
491
+
492
+ b, c, t = x0.shape
493
+ h = h.reshape(b, c, -1, t).permute(0, 1, 3, 2) # [b, cx?, t] -> [b, c, t, ?]
494
+
495
+ unnormalized_widths = h[..., : self.num_bins] / math.sqrt(self.filter_channels)
496
+ unnormalized_heights = h[..., self.num_bins : 2 * self.num_bins] / math.sqrt(
497
+ self.filter_channels
498
+ )
499
+ unnormalized_derivatives = h[..., 2 * self.num_bins :]
500
+
501
+ x1, logabsdet = piecewise_rational_quadratic_transform(
502
+ x1,
503
+ unnormalized_widths,
504
+ unnormalized_heights,
505
+ unnormalized_derivatives,
506
+ inverse=reverse,
507
+ tails="linear",
508
+ tail_bound=self.tail_bound,
509
+ )
510
+
511
+ x = torch.cat([x0, x1], 1) * x_mask
512
+ logdet = torch.sum(logabsdet * x_mask, [1, 2])
513
+ if not reverse:
514
+ return x, logdet
515
+ else:
516
+ return x
517
+
518
+
519
+ class TransformerCouplingLayer(nn.Module):
520
+ def __init__(
521
+ self,
522
+ channels,
523
+ hidden_channels,
524
+ kernel_size,
525
+ n_layers,
526
+ n_heads,
527
+ p_dropout=0,
528
+ filter_channels=0,
529
+ mean_only=False,
530
+ wn_sharing_parameter=None,
531
+ gin_channels=0,
532
+ ):
533
+ assert n_layers == 3, n_layers
534
+ assert channels % 2 == 0, "channels should be divisible by 2"
535
+ super().__init__()
536
+ self.channels = channels
537
+ self.hidden_channels = hidden_channels
538
+ self.kernel_size = kernel_size
539
+ self.n_layers = n_layers
540
+ self.half_channels = channels // 2
541
+ self.mean_only = mean_only
542
+
543
+ self.pre = nn.Conv1d(self.half_channels, hidden_channels, 1)
544
+ self.enc = (
545
+ Encoder(
546
+ hidden_channels,
547
+ filter_channels,
548
+ n_heads,
549
+ n_layers,
550
+ kernel_size,
551
+ p_dropout,
552
+ isflow=True,
553
+ gin_channels=gin_channels,
554
+ )
555
+ if wn_sharing_parameter is None
556
+ else wn_sharing_parameter
557
+ )
558
+ self.post = nn.Conv1d(hidden_channels, self.half_channels * (2 - mean_only), 1)
559
+ self.post.weight.data.zero_()
560
+ self.post.bias.data.zero_()
561
+
562
+ def forward(self, x, x_mask, g=None, reverse=False):
563
+ x0, x1 = torch.split(x, [self.half_channels] * 2, 1)
564
+ h = self.pre(x0) * x_mask
565
+ h = self.enc(h, x_mask, g=g)
566
+ stats = self.post(h) * x_mask
567
+ if not self.mean_only:
568
+ m, logs = torch.split(stats, [self.half_channels] * 2, 1)
569
+ else:
570
+ m = stats
571
+ logs = torch.zeros_like(m)
572
+
573
+ if not reverse:
574
+ x1 = m + x1 * torch.exp(logs) * x_mask
575
+ x = torch.cat([x0, x1], 1)
576
+ logdet = torch.sum(logs, [1, 2])
577
+ return x, logdet
578
+ else:
579
+ x1 = (x1 - m) * torch.exp(-logs) * x_mask
580
+ x = torch.cat([x0, x1], 1)
581
+ return x
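
A quick invertibility check of the coupling layer defined above may help readers of this commit. This is a minimal sketch, assuming the package is importable as `openvoice.modules` (i.e. `dreamvoice/train_utils/src` is on the Python path); the shapes and hyperparameters are illustrative only.

import torch
from openvoice.modules import ResidualCouplingLayer  # assumed import path

# toy tensors: batch=2, channels=4 (must be even), 50 frames, no masking
x = torch.randn(2, 4, 50)
x_mask = torch.ones(2, 1, 50)

layer = ResidualCouplingLayer(channels=4, hidden_channels=8, kernel_size=5,
                              dilation_rate=1, n_layers=2, mean_only=True)

with torch.no_grad():
    z, logdet = layer(x, x_mask)             # forward direction returns (z, logdet)
    x_rec = layer(z, x_mask, reverse=True)   # reverse direction undoes the coupling

print(torch.allclose(x, x_rec, atol=1e-5))   # expected: True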
dreamvoice/train_utils/src/openvoice/openvoice_app.py ADDED
@@ -0,0 +1,275 @@
1
+ import os
2
+ import torch
3
+ import argparse
4
+ import gradio as gr
5
+ from zipfile import ZipFile
6
+ import langid
7
+ from openvoice import se_extractor
8
+ from openvoice.api import BaseSpeakerTTS, ToneColorConverter
9
+
10
+ parser = argparse.ArgumentParser()
11
+ parser.add_argument("--share", action='store_true', default=False, help="make link public")
12
+ args = parser.parse_args()
13
+
14
+ en_ckpt_base = 'checkpoints/base_speakers/EN'
15
+ zh_ckpt_base = 'checkpoints/base_speakers/ZH'
16
+ ckpt_converter = 'checkpoints/converter'
17
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
18
+ output_dir = 'outputs'
19
+ os.makedirs(output_dir, exist_ok=True)
20
+
21
+ # load models
22
+ en_base_speaker_tts = BaseSpeakerTTS(f'{en_ckpt_base}/config.json', device=device)
23
+ en_base_speaker_tts.load_ckpt(f'{en_ckpt_base}/checkpoint.pth')
24
+ zh_base_speaker_tts = BaseSpeakerTTS(f'{zh_ckpt_base}/config.json', device=device)
25
+ zh_base_speaker_tts.load_ckpt(f'{zh_ckpt_base}/checkpoint.pth')
26
+ tone_color_converter = ToneColorConverter(f'{ckpt_converter}/config.json', device=device)
27
+ tone_color_converter.load_ckpt(f'{ckpt_converter}/checkpoint.pth')
28
+
29
+ # load speaker embeddings
30
+ en_source_default_se = torch.load(f'{en_ckpt_base}/en_default_se.pth').to(device)
31
+ en_source_style_se = torch.load(f'{en_ckpt_base}/en_style_se.pth').to(device)
32
+ zh_source_se = torch.load(f'{zh_ckpt_base}/zh_default_se.pth').to(device)
33
+
34
+ # This online demo mainly supports English and Chinese
35
+ supported_languages = ['zh', 'en']
36
+
37
+ def predict(prompt, style, audio_file_pth, agree):
38
+    # initialize an empty info string
39
+ text_hint = ''
40
+ # agree with the terms
41
+    if not agree:
42
+ text_hint += '[ERROR] Please accept the Terms & Condition!\n'
43
+ gr.Warning("Please accept the Terms & Condition!")
44
+ return (
45
+ text_hint,
46
+ None,
47
+ None,
48
+ )
49
+
50
+ # first detect the input language
51
+ language_predicted = langid.classify(prompt)[0].strip()
52
+ print(f"Detected language:{language_predicted}")
53
+
54
+ if language_predicted not in supported_languages:
55
+ text_hint += f"[ERROR] The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}\n"
56
+ gr.Warning(
57
+ f"The detected language {language_predicted} for your input text is not in our Supported Languages: {supported_languages}"
58
+ )
59
+
60
+ return (
61
+ text_hint,
62
+ None,
63
+ None,
64
+ )
65
+
66
+ if language_predicted == "zh":
67
+ tts_model = zh_base_speaker_tts
68
+ source_se = zh_source_se
69
+ language = 'Chinese'
70
+ if style not in ['default']:
71
+ text_hint += f"[ERROR] The style {style} is not supported for Chinese, which should be in ['default']\n"
72
+ gr.Warning(f"The style {style} is not supported for Chinese, which should be in ['default']")
73
+ return (
74
+ text_hint,
75
+ None,
76
+ None,
77
+ )
78
+
79
+ else:
80
+ tts_model = en_base_speaker_tts
81
+ if style == 'default':
82
+ source_se = en_source_default_se
83
+ else:
84
+ source_se = en_source_style_se
85
+ language = 'English'
86
+ if style not in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']:
87
+ text_hint += f"[ERROR] The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']\n"
88
+ gr.Warning(f"The style {style} is not supported for English, which should be in ['default', 'whispering', 'shouting', 'excited', 'cheerful', 'terrified', 'angry', 'sad', 'friendly']")
89
+ return (
90
+ text_hint,
91
+ None,
92
+ None,
93
+ )
94
+
95
+ speaker_wav = audio_file_pth
96
+
97
+ if len(prompt) < 2:
98
+ text_hint += f"[ERROR] Please give a longer prompt text \n"
99
+ gr.Warning("Please give a longer prompt text")
100
+ return (
101
+ text_hint,
102
+ None,
103
+ None,
104
+ )
105
+ if len(prompt) > 200:
106
+ text_hint += f"[ERROR] Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo and try for your usage \n"
107
+ gr.Warning(
108
+ "Text length limited to 200 characters for this demo, please try shorter text. You can clone our open-source repo for your usage"
109
+ )
110
+ return (
111
+ text_hint,
112
+ None,
113
+ None,
114
+ )
115
+
116
+    # note: diffusion_conditioning is not used in the default hifigan mode; it will be empty but still needs to be passed to model.inference
117
+ try:
118
+ target_se, audio_name = se_extractor.get_se(speaker_wav, tone_color_converter, target_dir='processed', vad=True)
119
+ except Exception as e:
120
+ text_hint += f"[ERROR] Get target tone color error {str(e)} \n"
121
+ gr.Warning(
122
+            f"[ERROR] Get target tone color error {str(e)}"
123
+ )
124
+ return (
125
+ text_hint,
126
+ None,
127
+ None,
128
+ )
129
+
130
+ src_path = f'{output_dir}/tmp.wav'
131
+ tts_model.tts(prompt, src_path, speaker=style, language=language)
132
+
133
+ save_path = f'{output_dir}/output.wav'
134
+ # Run the tone color converter
135
+ encode_message = "@MyShell"
136
+ tone_color_converter.convert(
137
+ audio_src_path=src_path,
138
+ src_se=source_se,
139
+ tgt_se=target_se,
140
+ output_path=save_path,
141
+ message=encode_message)
142
+
143
+ text_hint += f'''Get response successfully \n'''
144
+
145
+ return (
146
+ text_hint,
147
+ save_path,
148
+ speaker_wav,
149
+ )
150
+
151
+
152
+
153
+ title = "MyShell OpenVoice"
154
+
155
+ description = """
156
+ We introduce OpenVoice, a versatile instant voice cloning approach that requires only a short audio clip from the reference speaker to replicate their voice and generate speech in multiple languages. OpenVoice enables granular control over voice styles, including emotion, accent, rhythm, pauses, and intonation, in addition to replicating the tone color of the reference speaker. OpenVoice also achieves zero-shot cross-lingual voice cloning for languages not included in the massive-speaker training set.
157
+ """
158
+
159
+ markdown_table = """
160
+ <div align="center" style="margin-bottom: 10px;">
161
+
162
+ | | | |
163
+ | :-----------: | :-----------: | :-----------: |
164
+ | **OpenSource Repo** | **Project Page** | **Join the Community** |
165
+ | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | [OpenVoice](https://research.myshell.ai/open-voice) | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
166
+
167
+ </div>
168
+ """
169
+
170
+ markdown_table_v2 = """
171
+ <div align="center" style="margin-bottom: 2px;">
172
+
173
+ | | | | |
174
+ | :-----------: | :-----------: | :-----------: | :-----------: |
175
+ | **OpenSource Repo** | <div style='text-align: center;'><a style="display:inline-block,align:center" href='https://github.com/myshell-ai/OpenVoice'><img src='https://img.shields.io/github/stars/myshell-ai/OpenVoice?style=social' /></a></div> | **Project Page** | [OpenVoice](https://research.myshell.ai/open-voice) |
176
+
177
+ | | |
178
+ | :-----------: | :-----------: |
179
+ | **Join the Community** | [![Discord](https://img.shields.io/discord/1122227993805336617?color=%239B59B6&label=%20Discord%20)](https://discord.gg/myshell) |
180
+
181
+ </div>
182
+ """
183
+ content = """
184
+ <div>
185
+ <strong>If the generated voice does not sound like the reference voice, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/docs/QA.md'>this QnA</a>.</strong> <strong>For multi-lingual & cross-lingual examples, please refer to <a href='https://github.com/myshell-ai/OpenVoice/blob/main/demo_part2.ipynb'>this jupyter notebook</a>.</strong>
186
+ This online demo mainly supports <strong>English</strong>. The <em>default</em> style also supports <strong>Chinese</strong>. But OpenVoice can adapt to any other language as long as a base speaker is provided.
187
+ </div>
188
+ """
189
+ wrapped_markdown_content = f"<div style='border: 1px solid #000; padding: 10px;'>{content}</div>"
190
+
191
+
192
+ examples = [
193
+ [
194
+ "今天天气真好,我们一起出去吃饭吧。",
195
+ 'default',
196
+ "resources/demo_speaker1.mp3",
197
+ True,
198
+ ],[
199
+ "This audio is generated by open voice with a half-performance model.",
200
+ 'whispering',
201
+ "resources/demo_speaker2.mp3",
202
+ True,
203
+ ],
204
+ [
205
+ "He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
206
+ 'sad',
207
+ "resources/demo_speaker0.mp3",
208
+ True,
209
+ ],
210
+ ]
211
+
212
+ with gr.Blocks(analytics_enabled=False) as demo:
213
+
214
+ with gr.Row():
215
+ with gr.Column():
216
+ with gr.Row():
217
+ gr.Markdown(
218
+ """
219
+ ## <img src="https://huggingface.co/spaces/myshell-ai/OpenVoice/raw/main/logo.jpg" height="40"/>
220
+ """
221
+ )
222
+ with gr.Row():
223
+ gr.Markdown(markdown_table_v2)
224
+ with gr.Row():
225
+ gr.Markdown(description)
226
+ with gr.Column():
227
+ gr.Video('https://github.com/myshell-ai/OpenVoice/assets/40556743/3cba936f-82bf-476c-9e52-09f0f417bb2f', autoplay=True)
228
+
229
+ with gr.Row():
230
+ gr.HTML(wrapped_markdown_content)
231
+
232
+ with gr.Row():
233
+ with gr.Column():
234
+ input_text_gr = gr.Textbox(
235
+ label="Text Prompt",
236
+ info="One or two sentences at a time is better. Up to 200 text characters.",
237
+ value="He hoped there would be stew for dinner, turnips and carrots and bruised potatoes and fat mutton pieces to be ladled out in thick, peppered, flour-fattened sauce.",
238
+ )
239
+ style_gr = gr.Dropdown(
240
+ label="Style",
241
+            info="Select a style of output audio for the synthesised speech. (Chinese currently supports only 'default'.)",
242
+ choices=['default', 'whispering', 'cheerful', 'terrified', 'angry', 'sad', 'friendly'],
243
+ max_choices=1,
244
+ value="default",
245
+ )
246
+ ref_gr = gr.Audio(
247
+ label="Reference Audio",
248
+ info="Click on the ✎ button to upload your own target speaker audio",
249
+ type="filepath",
250
+ value="resources/demo_speaker2.mp3",
251
+ )
252
+ tos_gr = gr.Checkbox(
253
+ label="Agree",
254
+ value=False,
255
+ info="I agree to the terms of the cc-by-nc-4.0 license-: https://github.com/myshell-ai/OpenVoice/blob/main/LICENSE",
256
+ )
257
+
258
+ tts_button = gr.Button("Send", elem_id="send-btn", visible=True)
259
+
260
+
261
+ with gr.Column():
262
+ out_text_gr = gr.Text(label="Info")
263
+ audio_gr = gr.Audio(label="Synthesised Audio", autoplay=True)
264
+ ref_audio_gr = gr.Audio(label="Reference Audio Used")
265
+
266
+ gr.Examples(examples,
267
+ label="Examples",
268
+ inputs=[input_text_gr, style_gr, ref_gr, tos_gr],
269
+ outputs=[out_text_gr, audio_gr, ref_audio_gr],
270
+ fn=predict,
271
+ cache_examples=False,)
272
+ tts_button.click(predict, [input_text_gr, style_gr, ref_gr, tos_gr], outputs=[out_text_gr, audio_gr, ref_audio_gr])
273
+
274
+ demo.queue()
275
+ demo.launch(debug=True, show_api=True, share=args.share)
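
The same pipeline the app wires into Gradio can also be driven directly. A minimal sketch, assuming the checkpoints referenced above have been downloaded and `reference.mp3` stands in for any short clip of the target speaker; all calls mirror the script above.

import os
import torch
from openvoice import se_extractor
from openvoice.api import BaseSpeakerTTS, ToneColorConverter

device = 'cuda' if torch.cuda.is_available() else 'cpu'
os.makedirs('outputs', exist_ok=True)

tts = BaseSpeakerTTS('checkpoints/base_speakers/EN/config.json', device=device)
tts.load_ckpt('checkpoints/base_speakers/EN/checkpoint.pth')
converter = ToneColorConverter('checkpoints/converter/config.json', device=device)
converter.load_ckpt('checkpoints/converter/checkpoint.pth')

# base-speaker embedding shipped with the checkpoints, target embedding from the reference clip
source_se = torch.load('checkpoints/base_speakers/EN/en_default_se.pth').to(device)
target_se, _ = se_extractor.get_se('reference.mp3', converter, target_dir='processed', vad=True)

# synthesise with the base speaker, then convert its tone color to the target
tts.tts('A short test sentence.', 'outputs/tmp.wav', speaker='default', language='English')
converter.convert(audio_src_path='outputs/tmp.wav', src_se=source_se, tgt_se=target_se,
                  output_path='outputs/output.wav', message='@MyShell')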
dreamvoice/train_utils/src/openvoice/se_extractor.py ADDED
@@ -0,0 +1,153 @@
1
+ import os
2
+ import glob
3
+ import torch
4
+ import hashlib
5
+ import librosa
6
+ import base64
7
+ from glob import glob
8
+ import numpy as np
9
+ from pydub import AudioSegment
10
+ from faster_whisper import WhisperModel
14
+ from whisper_timestamped.transcribe import get_audio_tensor, get_vad_segments
15
+
16
+ model_size = "medium"
17
+ # Run on GPU with FP16
18
+ model = None
19
+ def split_audio_whisper(audio_path, audio_name, target_dir='processed'):
20
+ global model
21
+ if model is None:
22
+ model = WhisperModel(model_size, device="cuda", compute_type="float16")
23
+ audio = AudioSegment.from_file(audio_path)
24
+ max_len = len(audio)
25
+
26
+ target_folder = os.path.join(target_dir, audio_name)
27
+
28
+ segments, info = model.transcribe(audio_path, beam_size=5, word_timestamps=True)
29
+ segments = list(segments)
30
+
31
+ # create directory
32
+ os.makedirs(target_folder, exist_ok=True)
33
+ wavs_folder = os.path.join(target_folder, 'wavs')
34
+ os.makedirs(wavs_folder, exist_ok=True)
35
+
36
+ # segments
37
+ s_ind = 0
38
+ start_time = None
39
+
40
+ for k, w in enumerate(segments):
41
+ # process with the time
42
+ if k == 0:
43
+ start_time = max(0, w.start)
44
+
45
+ end_time = w.end
46
+
47
+ # calculate confidence
48
+ if len(w.words) > 0:
49
+ confidence = sum([s.probability for s in w.words]) / len(w.words)
50
+ else:
51
+ confidence = 0.
52
+ # clean text
53
+ text = w.text.replace('...', '')
54
+
55
+ # left 0.08s for each audios
56
+ audio_seg = audio[int( start_time * 1000) : min(max_len, int(end_time * 1000) + 80)]
57
+
58
+ # segment file name
59
+ fname = f"{audio_name}_seg{s_ind}.wav"
60
+
61
+ # filter out the segment shorter than 1.5s and longer than 20s
62
+ save = audio_seg.duration_seconds > 1.5 and \
63
+ audio_seg.duration_seconds < 20. and \
64
+ len(text) >= 2 and len(text) < 200
65
+
66
+ if save:
67
+ output_file = os.path.join(wavs_folder, fname)
68
+ audio_seg.export(output_file, format='wav')
69
+
70
+ if k < len(segments) - 1:
71
+ start_time = max(0, segments[k+1].start - 0.08)
72
+
73
+ s_ind = s_ind + 1
74
+ return wavs_folder
75
+
76
+
77
+ def split_audio_vad(audio_path, audio_name, target_dir, split_seconds=10.0):
78
+ SAMPLE_RATE = 16000
79
+ audio_vad = get_audio_tensor(audio_path)
80
+ segments = get_vad_segments(
81
+ audio_vad,
82
+ output_sample=True,
83
+ min_speech_duration=0.1,
84
+ min_silence_duration=1,
85
+ method="silero",
86
+ )
87
+ segments = [(seg["start"], seg["end"]) for seg in segments]
88
+ segments = [(float(s) / SAMPLE_RATE, float(e) / SAMPLE_RATE) for s,e in segments]
89
+ print(segments)
90
+ audio_active = AudioSegment.silent(duration=0)
91
+ audio = AudioSegment.from_file(audio_path)
92
+
93
+ for start_time, end_time in segments:
94
+ audio_active += audio[int( start_time * 1000) : int(end_time * 1000)]
95
+
96
+ audio_dur = audio_active.duration_seconds
97
+ print(f'after vad: dur = {audio_dur}')
98
+ target_folder = os.path.join(target_dir, audio_name)
99
+ wavs_folder = os.path.join(target_folder, 'wavs')
100
+ os.makedirs(wavs_folder, exist_ok=True)
101
+ start_time = 0.
102
+ count = 0
103
+ num_splits = int(np.round(audio_dur / split_seconds))
104
+ assert num_splits > 0, 'input audio is too short'
105
+ interval = audio_dur / num_splits
106
+
107
+ for i in range(num_splits):
108
+ end_time = min(start_time + interval, audio_dur)
109
+ if i == num_splits - 1:
110
+ end_time = audio_dur
111
+ output_file = f"{wavs_folder}/{audio_name}_seg{count}.wav"
112
+ audio_seg = audio_active[int(start_time * 1000): int(end_time * 1000)]
113
+ audio_seg.export(output_file, format='wav')
114
+ start_time = end_time
115
+ count += 1
116
+ return wavs_folder
117
+
118
+ def hash_numpy_array(audio_path):
119
+ array, _ = librosa.load(audio_path, sr=None, mono=True)
120
+ # Convert the array to bytes
121
+ array_bytes = array.tobytes()
122
+ # Calculate the hash of the array bytes
123
+ hash_object = hashlib.sha256(array_bytes)
124
+ hash_value = hash_object.digest()
125
+ # Convert the hash value to base64
126
+ base64_value = base64.b64encode(hash_value)
127
+ return base64_value.decode('utf-8')[:16].replace('/', '_^')
128
+
129
+ def get_se(audio_path, vc_model, target_dir='processed', vad=True):
130
+ device = vc_model.device
131
+ version = vc_model.version
132
+ print("OpenVoice version:", version)
133
+
134
+ audio_name = f"{os.path.basename(audio_path).rsplit('.', 1)[0]}_{version}_{hash_numpy_array(audio_path)}"
135
+ se_path = os.path.join(target_dir, audio_name, 'se.pth')
136
+
137
+ # if os.path.isfile(se_path):
138
+ # se = torch.load(se_path).to(device)
139
+ # return se, audio_name
140
+ # if os.path.isdir(audio_path):
141
+ # wavs_folder = audio_path
142
+
143
+ if vad:
144
+ wavs_folder = split_audio_vad(audio_path, target_dir=target_dir, audio_name=audio_name)
145
+ else:
146
+ wavs_folder = split_audio_whisper(audio_path, target_dir=target_dir, audio_name=audio_name)
147
+
148
+ audio_segs = glob(f'{wavs_folder}/*.wav')
149
+ if len(audio_segs) == 0:
150
+ raise NotImplementedError('No audio segments found!')
151
+
152
+ return vc_model.extract_se(audio_segs, se_save_path=se_path), audio_name
153
+
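
The splitting helpers above can also be used on their own. A small sketch with an illustrative `reference.wav`, assuming pydub and whisper-timestamped are installed; the printed folder layout follows the code above.

from openvoice import se_extractor

audio_path = 'reference.wav'                              # illustrative input file
audio_key = se_extractor.hash_numpy_array(audio_path)     # 16-char hash of the waveform content
wavs_folder = se_extractor.split_audio_vad(
    audio_path, audio_name=f'demo_{audio_key}', target_dir='processed', split_seconds=10.0)
print(wavs_folder)   # processed/demo_<hash>/wavs, holding demo_<hash>_seg0.wav, _seg1.wav, ...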
dreamvoice/train_utils/src/openvoice/text/__init__.py ADDED
@@ -0,0 +1,79 @@
1
+ """ from https://github.com/keithito/tacotron """
2
+ from openvoice.text import cleaners
3
+ from openvoice.text.symbols import symbols
4
+
5
+
6
+ # Mappings from symbol to numeric ID and vice versa:
7
+ _symbol_to_id = {s: i for i, s in enumerate(symbols)}
8
+ _id_to_symbol = {i: s for i, s in enumerate(symbols)}
9
+
10
+
11
+ def text_to_sequence(text, symbols, cleaner_names):
12
+ '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
13
+ Args:
14
+ text: string to convert to a sequence
15
+ cleaner_names: names of the cleaner functions to run the text through
16
+ Returns:
17
+ List of integers corresponding to the symbols in the text
18
+ '''
19
+ sequence = []
20
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
21
+ clean_text = _clean_text(text, cleaner_names)
22
+ print(clean_text)
23
+ print(f" length:{len(clean_text)}")
24
+ for symbol in clean_text:
25
+ if symbol not in symbol_to_id.keys():
26
+ continue
27
+ symbol_id = symbol_to_id[symbol]
28
+ sequence += [symbol_id]
29
+ print(f" length:{len(sequence)}")
30
+ return sequence
31
+
32
+
33
+ def cleaned_text_to_sequence(cleaned_text, symbols):
34
+ '''Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
35
+ Args:
36
+ text: string to convert to a sequence
37
+ Returns:
38
+ List of integers corresponding to the symbols in the text
39
+ '''
40
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
41
+ sequence = [symbol_to_id[symbol] for symbol in cleaned_text if symbol in symbol_to_id.keys()]
42
+ return sequence
43
+
44
+
45
+
46
+ from openvoice.text.symbols import language_tone_start_map
47
+ def cleaned_text_to_sequence_vits2(cleaned_text, tones, language, symbols, languages):
48
+ """Converts a string of text to a sequence of IDs corresponding to the symbols in the text.
49
+ Args:
50
+ text: string to convert to a sequence
51
+ Returns:
52
+ List of integers corresponding to the symbols in the text
53
+ """
54
+ symbol_to_id = {s: i for i, s in enumerate(symbols)}
55
+ language_id_map = {s: i for i, s in enumerate(languages)}
56
+ phones = [symbol_to_id[symbol] for symbol in cleaned_text]
57
+ tone_start = language_tone_start_map[language]
58
+ tones = [i + tone_start for i in tones]
59
+ lang_id = language_id_map[language]
60
+ lang_ids = [lang_id for i in phones]
61
+ return phones, tones, lang_ids
62
+
63
+
64
+ def sequence_to_text(sequence):
65
+ '''Converts a sequence of IDs back to a string'''
66
+ result = ''
67
+ for symbol_id in sequence:
68
+ s = _id_to_symbol[symbol_id]
69
+ result += s
70
+ return result
71
+
72
+
73
+ def _clean_text(text, cleaner_names):
74
+ for name in cleaner_names:
75
+ cleaner = getattr(cleaners, name)
76
+ if not cleaner:
77
+ raise Exception('Unknown cleaner: %s' % name)
78
+ text = cleaner(text)
79
+ return text
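
A quick sketch of the ID mapping above. Symbols not present in the set are silently skipped, so the round trip is only exact for in-vocabulary characters; the IPA string here is illustrative.

from openvoice.text import cleaned_text_to_sequence, sequence_to_text
from openvoice.text.symbols import symbols

ids = cleaned_text_to_sequence('həloʊ', symbols)   # IDs of an already-cleaned IPA string
print(ids)
print(sequence_to_text(ids))                       # 'həloʊ' recovered from the IDs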
dreamvoice/train_utils/src/openvoice/text/cleaners.py ADDED
@@ -0,0 +1,16 @@
1
+ import re
2
+ from openvoice.text.english import english_to_lazy_ipa, english_to_ipa2, english_to_lazy_ipa2
3
+ from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, latin_to_bopomofo, chinese_to_romaji, chinese_to_lazy_ipa, chinese_to_ipa, chinese_to_ipa2
4
+
5
+ def cjke_cleaners2(text):
6
+ text = re.sub(r'\[ZH\](.*?)\[ZH\]',
7
+ lambda x: chinese_to_ipa(x.group(1))+' ', text)
8
+ text = re.sub(r'\[JA\](.*?)\[JA\]',
9
+ lambda x: japanese_to_ipa2(x.group(1))+' ', text)
10
+ text = re.sub(r'\[KO\](.*?)\[KO\]',
11
+ lambda x: korean_to_ipa(x.group(1))+' ', text)
12
+ text = re.sub(r'\[EN\](.*?)\[EN\]',
13
+ lambda x: english_to_ipa2(x.group(1))+' ', text)
14
+ text = re.sub(r'\s+$', '', text)
15
+ text = re.sub(r'([^\.,!\?\-…~])$', r'\1.', text)
16
+ return text
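
Example use of the cleaner above: it only rewrites text wrapped in language tags and appends a final '.' when the input ends without punctuation. Note that the [JA] and [KO] branches reference japanese_to_ipa2 and korean_to_ipa, which are not imported in this file, so only the [ZH] and [EN] tags are usable as committed.

from openvoice.text.cleaners import cjke_cleaners2

print(cjke_cleaners2('[EN]How are you[EN]'))   # English span converted to IPA2, '.' appended
print(cjke_cleaners2('[ZH]你好[ZH]'))           # Chinese span converted via chinese_to_ipa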
dreamvoice/train_utils/src/openvoice/text/english.py ADDED
@@ -0,0 +1,188 @@
1
+ """ from https://github.com/keithito/tacotron """
2
+
3
+ '''
4
+ Cleaners are transformations that run over the input text at both training and eval time.
5
+
6
+ Cleaners can be selected by passing a comma-delimited list of cleaner names as the "cleaners"
7
+ hyperparameter. Some cleaners are English-specific. You'll typically want to use:
8
+ 1. "english_cleaners" for English text
9
+ 2. "transliteration_cleaners" for non-English text that can be transliterated to ASCII using
10
+ the Unidecode library (https://pypi.python.org/pypi/Unidecode)
11
+ 3. "basic_cleaners" if you do not want to transliterate (in this case, you should also update
12
+ the symbols in symbols.py to match your data).
13
+ '''
14
+
15
+
16
+ # Regular expression matching whitespace:
17
+
18
+
19
+ import re
20
+ import inflect
21
+ from unidecode import unidecode
22
+ import eng_to_ipa as ipa
23
+ _inflect = inflect.engine()
24
+ _comma_number_re = re.compile(r'([0-9][0-9\,]+[0-9])')
25
+ _decimal_number_re = re.compile(r'([0-9]+\.[0-9]+)')
26
+ _pounds_re = re.compile(r'£([0-9\,]*[0-9]+)')
27
+ _dollars_re = re.compile(r'\$([0-9\.\,]*[0-9]+)')
28
+ _ordinal_re = re.compile(r'[0-9]+(st|nd|rd|th)')
29
+ _number_re = re.compile(r'[0-9]+')
30
+
31
+ # List of (regular expression, replacement) pairs for abbreviations:
32
+ _abbreviations = [(re.compile('\\b%s\\.' % x[0], re.IGNORECASE), x[1]) for x in [
33
+ ('mrs', 'misess'),
34
+ ('mr', 'mister'),
35
+ ('dr', 'doctor'),
36
+ ('st', 'saint'),
37
+ ('co', 'company'),
38
+ ('jr', 'junior'),
39
+ ('maj', 'major'),
40
+ ('gen', 'general'),
41
+ ('drs', 'doctors'),
42
+ ('rev', 'reverend'),
43
+ ('lt', 'lieutenant'),
44
+ ('hon', 'honorable'),
45
+ ('sgt', 'sergeant'),
46
+ ('capt', 'captain'),
47
+ ('esq', 'esquire'),
48
+ ('ltd', 'limited'),
49
+ ('col', 'colonel'),
50
+ ('ft', 'fort'),
51
+ ]]
52
+
53
+
54
+ # List of (ipa, lazy ipa) pairs:
55
+ _lazy_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
56
+ ('r', 'ɹ'),
57
+ ('æ', 'e'),
58
+ ('ɑ', 'a'),
59
+ ('ɔ', 'o'),
60
+ ('ð', 'z'),
61
+ ('θ', 's'),
62
+ ('ɛ', 'e'),
63
+ ('ɪ', 'i'),
64
+ ('ʊ', 'u'),
65
+ ('ʒ', 'ʥ'),
66
+ ('ʤ', 'ʥ'),
67
+ ('ˈ', '↓'),
68
+ ]]
69
+
70
+ # List of (ipa, lazy ipa2) pairs:
71
+ _lazy_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
72
+ ('r', 'ɹ'),
73
+ ('ð', 'z'),
74
+ ('θ', 's'),
75
+ ('ʒ', 'ʑ'),
76
+ ('ʤ', 'dʑ'),
77
+ ('ˈ', '↓'),
78
+ ]]
79
+
80
+ # List of (ipa, ipa2) pairs
81
+ _ipa_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
82
+ ('r', 'ɹ'),
83
+ ('ʤ', 'dʒ'),
84
+ ('ʧ', 'tʃ')
85
+ ]]
86
+
87
+
88
+ def expand_abbreviations(text):
89
+ for regex, replacement in _abbreviations:
90
+ text = re.sub(regex, replacement, text)
91
+ return text
92
+
93
+
94
+ def collapse_whitespace(text):
95
+ return re.sub(r'\s+', ' ', text)
96
+
97
+
98
+ def _remove_commas(m):
99
+ return m.group(1).replace(',', '')
100
+
101
+
102
+ def _expand_decimal_point(m):
103
+ return m.group(1).replace('.', ' point ')
104
+
105
+
106
+ def _expand_dollars(m):
107
+ match = m.group(1)
108
+ parts = match.split('.')
109
+ if len(parts) > 2:
110
+ return match + ' dollars' # Unexpected format
111
+ dollars = int(parts[0]) if parts[0] else 0
112
+ cents = int(parts[1]) if len(parts) > 1 and parts[1] else 0
113
+ if dollars and cents:
114
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
115
+ cent_unit = 'cent' if cents == 1 else 'cents'
116
+ return '%s %s, %s %s' % (dollars, dollar_unit, cents, cent_unit)
117
+ elif dollars:
118
+ dollar_unit = 'dollar' if dollars == 1 else 'dollars'
119
+ return '%s %s' % (dollars, dollar_unit)
120
+ elif cents:
121
+ cent_unit = 'cent' if cents == 1 else 'cents'
122
+ return '%s %s' % (cents, cent_unit)
123
+ else:
124
+ return 'zero dollars'
125
+
126
+
127
+ def _expand_ordinal(m):
128
+ return _inflect.number_to_words(m.group(0))
129
+
130
+
131
+ def _expand_number(m):
132
+ num = int(m.group(0))
133
+ if num > 1000 and num < 3000:
134
+ if num == 2000:
135
+ return 'two thousand'
136
+ elif num > 2000 and num < 2010:
137
+ return 'two thousand ' + _inflect.number_to_words(num % 100)
138
+ elif num % 100 == 0:
139
+ return _inflect.number_to_words(num // 100) + ' hundred'
140
+ else:
141
+ return _inflect.number_to_words(num, andword='', zero='oh', group=2).replace(', ', ' ')
142
+ else:
143
+ return _inflect.number_to_words(num, andword='')
144
+
145
+
146
+ def normalize_numbers(text):
147
+ text = re.sub(_comma_number_re, _remove_commas, text)
148
+ text = re.sub(_pounds_re, r'\1 pounds', text)
149
+ text = re.sub(_dollars_re, _expand_dollars, text)
150
+ text = re.sub(_decimal_number_re, _expand_decimal_point, text)
151
+ text = re.sub(_ordinal_re, _expand_ordinal, text)
152
+ text = re.sub(_number_re, _expand_number, text)
153
+ return text
154
+
155
+
156
+ def mark_dark_l(text):
157
+ return re.sub(r'l([^aeiouæɑɔəɛɪʊ ]*(?: |$))', lambda x: 'ɫ'+x.group(1), text)
158
+
159
+
160
+ def english_to_ipa(text):
161
+ text = unidecode(text).lower()
162
+ text = expand_abbreviations(text)
163
+ text = normalize_numbers(text)
164
+ phonemes = ipa.convert(text)
165
+ phonemes = collapse_whitespace(phonemes)
166
+ return phonemes
167
+
168
+
169
+ def english_to_lazy_ipa(text):
170
+ text = english_to_ipa(text)
171
+ for regex, replacement in _lazy_ipa:
172
+ text = re.sub(regex, replacement, text)
173
+ return text
174
+
175
+
176
+ def english_to_ipa2(text):
177
+ text = english_to_ipa(text)
178
+ text = mark_dark_l(text)
179
+ for regex, replacement in _ipa_to_ipa2:
180
+ text = re.sub(regex, replacement, text)
181
+ return text.replace('...', '…')
182
+
183
+
184
+ def english_to_lazy_ipa2(text):
185
+ text = english_to_ipa(text)
186
+ for regex, replacement in _lazy_ipa2:
187
+ text = re.sub(regex, replacement, text)
188
+ return text
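
The conversion chain above expands abbreviations and numbers before handing the text to eng_to_ipa. A short sketch; it assumes eng_to_ipa, inflect and unidecode are installed, and the exact phoneme string depends on the eng_to_ipa dictionary.

from openvoice.text.english import english_to_ipa2, english_to_lazy_ipa2

text = 'Dr. Smith paid $2.50 on the 3rd of May.'
print(english_to_ipa2(text))        # abbreviations and numbers expanded, then IPA with dark-l marking
print(english_to_lazy_ipa2(text))   # same text mapped onto the reduced "lazy" IPA symbol set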
dreamvoice/train_utils/src/openvoice/text/mandarin.py ADDED
@@ -0,0 +1,326 @@
1
+ import os
2
+ import sys
3
+ import re
4
+ from pypinyin import lazy_pinyin, BOPOMOFO
5
+ import jieba
6
+ import cn2an
7
+ import logging
8
+
9
+
10
+ # List of (Latin alphabet, bopomofo) pairs:
11
+ _latin_to_bopomofo = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
12
+ ('a', 'ㄟˉ'),
13
+ ('b', 'ㄅㄧˋ'),
14
+ ('c', 'ㄙㄧˉ'),
15
+ ('d', 'ㄉㄧˋ'),
16
+ ('e', 'ㄧˋ'),
17
+ ('f', 'ㄝˊㄈㄨˋ'),
18
+ ('g', 'ㄐㄧˋ'),
19
+ ('h', 'ㄝˇㄑㄩˋ'),
20
+ ('i', 'ㄞˋ'),
21
+ ('j', 'ㄐㄟˋ'),
22
+ ('k', 'ㄎㄟˋ'),
23
+ ('l', 'ㄝˊㄛˋ'),
24
+ ('m', 'ㄝˊㄇㄨˋ'),
25
+ ('n', 'ㄣˉ'),
26
+ ('o', 'ㄡˉ'),
27
+ ('p', 'ㄆㄧˉ'),
28
+ ('q', 'ㄎㄧㄡˉ'),
29
+ ('r', 'ㄚˋ'),
30
+ ('s', 'ㄝˊㄙˋ'),
31
+ ('t', 'ㄊㄧˋ'),
32
+ ('u', 'ㄧㄡˉ'),
33
+ ('v', 'ㄨㄧˉ'),
34
+ ('w', 'ㄉㄚˋㄅㄨˋㄌㄧㄡˋ'),
35
+ ('x', 'ㄝˉㄎㄨˋㄙˋ'),
36
+ ('y', 'ㄨㄞˋ'),
37
+ ('z', 'ㄗㄟˋ')
38
+ ]]
39
+
40
+ # List of (bopomofo, romaji) pairs:
41
+ _bopomofo_to_romaji = [(re.compile('%s' % x[0]), x[1]) for x in [
42
+ ('ㄅㄛ', 'p⁼wo'),
43
+ ('ㄆㄛ', 'pʰwo'),
44
+ ('ㄇㄛ', 'mwo'),
45
+ ('ㄈㄛ', 'fwo'),
46
+ ('ㄅ', 'p⁼'),
47
+ ('ㄆ', 'pʰ'),
48
+ ('ㄇ', 'm'),
49
+ ('ㄈ', 'f'),
50
+ ('ㄉ', 't⁼'),
51
+ ('ㄊ', 'tʰ'),
52
+ ('ㄋ', 'n'),
53
+ ('ㄌ', 'l'),
54
+ ('ㄍ', 'k⁼'),
55
+ ('ㄎ', 'kʰ'),
56
+ ('ㄏ', 'h'),
57
+ ('ㄐ', 'ʧ⁼'),
58
+ ('ㄑ', 'ʧʰ'),
59
+ ('ㄒ', 'ʃ'),
60
+ ('ㄓ', 'ʦ`⁼'),
61
+ ('ㄔ', 'ʦ`ʰ'),
62
+ ('ㄕ', 's`'),
63
+ ('ㄖ', 'ɹ`'),
64
+ ('ㄗ', 'ʦ⁼'),
65
+ ('ㄘ', 'ʦʰ'),
66
+ ('ㄙ', 's'),
67
+ ('ㄚ', 'a'),
68
+ ('ㄛ', 'o'),
69
+ ('ㄜ', 'ə'),
70
+ ('ㄝ', 'e'),
71
+ ('ㄞ', 'ai'),
72
+ ('ㄟ', 'ei'),
73
+ ('ㄠ', 'au'),
74
+ ('ㄡ', 'ou'),
75
+ ('ㄧㄢ', 'yeNN'),
76
+ ('ㄢ', 'aNN'),
77
+ ('ㄧㄣ', 'iNN'),
78
+ ('ㄣ', 'əNN'),
79
+ ('ㄤ', 'aNg'),
80
+ ('ㄧㄥ', 'iNg'),
81
+ ('ㄨㄥ', 'uNg'),
82
+ ('ㄩㄥ', 'yuNg'),
83
+ ('ㄥ', 'əNg'),
84
+ ('ㄦ', 'əɻ'),
85
+ ('ㄧ', 'i'),
86
+ ('ㄨ', 'u'),
87
+ ('ㄩ', 'ɥ'),
88
+ ('ˉ', '→'),
89
+ ('ˊ', '↑'),
90
+ ('ˇ', '↓↑'),
91
+ ('ˋ', '↓'),
92
+ ('˙', ''),
93
+ (',', ','),
94
+ ('。', '.'),
95
+ ('!', '!'),
96
+ ('?', '?'),
97
+ ('—', '-')
98
+ ]]
99
+
100
+ # List of (romaji, ipa) pairs:
101
+ _romaji_to_ipa = [(re.compile('%s' % x[0], re.IGNORECASE), x[1]) for x in [
102
+ ('ʃy', 'ʃ'),
103
+ ('ʧʰy', 'ʧʰ'),
104
+ ('ʧ⁼y', 'ʧ⁼'),
105
+ ('NN', 'n'),
106
+ ('Ng', 'ŋ'),
107
+ ('y', 'j'),
108
+ ('h', 'x')
109
+ ]]
110
+
111
+ # List of (bopomofo, ipa) pairs:
112
+ _bopomofo_to_ipa = [(re.compile('%s' % x[0]), x[1]) for x in [
113
+ ('ㄅㄛ', 'p⁼wo'),
114
+ ('ㄆㄛ', 'pʰwo'),
115
+ ('ㄇㄛ', 'mwo'),
116
+ ('ㄈㄛ', 'fwo'),
117
+ ('ㄅ', 'p⁼'),
118
+ ('ㄆ', 'pʰ'),
119
+ ('ㄇ', 'm'),
120
+ ('ㄈ', 'f'),
121
+ ('ㄉ', 't⁼'),
122
+ ('ㄊ', 'tʰ'),
123
+ ('ㄋ', 'n'),
124
+ ('ㄌ', 'l'),
125
+ ('ㄍ', 'k⁼'),
126
+ ('ㄎ', 'kʰ'),
127
+ ('ㄏ', 'x'),
128
+ ('ㄐ', 'tʃ⁼'),
129
+ ('ㄑ', 'tʃʰ'),
130
+ ('ㄒ', 'ʃ'),
131
+ ('ㄓ', 'ts`⁼'),
132
+ ('ㄔ', 'ts`ʰ'),
133
+ ('ㄕ', 's`'),
134
+ ('ㄖ', 'ɹ`'),
135
+ ('ㄗ', 'ts⁼'),
136
+ ('ㄘ', 'tsʰ'),
137
+ ('ㄙ', 's'),
138
+ ('ㄚ', 'a'),
139
+ ('ㄛ', 'o'),
140
+ ('ㄜ', 'ə'),
141
+ ('ㄝ', 'ɛ'),
142
+ ('ㄞ', 'aɪ'),
143
+ ('ㄟ', 'eɪ'),
144
+ ('ㄠ', 'ɑʊ'),
145
+ ('ㄡ', 'oʊ'),
146
+ ('ㄧㄢ', 'jɛn'),
147
+ ('ㄩㄢ', 'ɥæn'),
148
+ ('ㄢ', 'an'),
149
+ ('ㄧㄣ', 'in'),
150
+ ('ㄩㄣ', 'ɥn'),
151
+ ('ㄣ', 'ən'),
152
+ ('ㄤ', 'ɑŋ'),
153
+ ('ㄧㄥ', 'iŋ'),
154
+ ('ㄨㄥ', 'ʊŋ'),
155
+ ('ㄩㄥ', 'jʊŋ'),
156
+ ('ㄥ', 'əŋ'),
157
+ ('ㄦ', 'əɻ'),
158
+ ('ㄧ', 'i'),
159
+ ('ㄨ', 'u'),
160
+ ('ㄩ', 'ɥ'),
161
+ ('ˉ', '→'),
162
+ ('ˊ', '↑'),
163
+ ('ˇ', '↓↑'),
164
+ ('ˋ', '↓'),
165
+ ('˙', ''),
166
+ (',', ','),
167
+ ('。', '.'),
168
+ ('!', '!'),
169
+ ('?', '?'),
170
+ ('—', '-')
171
+ ]]
172
+
173
+ # List of (bopomofo, ipa2) pairs:
174
+ _bopomofo_to_ipa2 = [(re.compile('%s' % x[0]), x[1]) for x in [
175
+ ('ㄅㄛ', 'pwo'),
176
+ ('ㄆㄛ', 'pʰwo'),
177
+ ('ㄇㄛ', 'mwo'),
178
+ ('ㄈㄛ', 'fwo'),
179
+ ('ㄅ', 'p'),
180
+ ('ㄆ', 'pʰ'),
181
+ ('ㄇ', 'm'),
182
+ ('ㄈ', 'f'),
183
+ ('ㄉ', 't'),
184
+ ('ㄊ', 'tʰ'),
185
+ ('ㄋ', 'n'),
186
+ ('ㄌ', 'l'),
187
+ ('ㄍ', 'k'),
188
+ ('ㄎ', 'kʰ'),
189
+ ('ㄏ', 'h'),
190
+ ('ㄐ', 'tɕ'),
191
+ ('ㄑ', 'tɕʰ'),
192
+ ('ㄒ', 'ɕ'),
193
+ ('ㄓ', 'tʂ'),
194
+ ('ㄔ', 'tʂʰ'),
195
+ ('ㄕ', 'ʂ'),
196
+ ('ㄖ', 'ɻ'),
197
+ ('ㄗ', 'ts'),
198
+ ('ㄘ', 'tsʰ'),
199
+ ('ㄙ', 's'),
200
+ ('ㄚ', 'a'),
201
+ ('ㄛ', 'o'),
202
+ ('ㄜ', 'ɤ'),
203
+ ('ㄝ', 'ɛ'),
204
+ ('ㄞ', 'aɪ'),
205
+ ('ㄟ', 'eɪ'),
206
+ ('ㄠ', 'ɑʊ'),
207
+ ('ㄡ', 'oʊ'),
208
+ ('ㄧㄢ', 'jɛn'),
209
+ ('ㄩㄢ', 'yæn'),
210
+ ('ㄢ', 'an'),
211
+ ('ㄧㄣ', 'in'),
212
+ ('ㄩㄣ', 'yn'),
213
+ ('ㄣ', 'ən'),
214
+ ('ㄤ', 'ɑŋ'),
215
+ ('ㄧㄥ', 'iŋ'),
216
+ ('ㄨㄥ', 'ʊŋ'),
217
+ ('ㄩㄥ', 'jʊŋ'),
218
+ ('ㄥ', 'ɤŋ'),
219
+ ('ㄦ', 'əɻ'),
220
+ ('ㄧ', 'i'),
221
+ ('ㄨ', 'u'),
222
+ ('ㄩ', 'y'),
223
+ ('ˉ', '˥'),
224
+ ('ˊ', '˧˥'),
225
+ ('ˇ', '˨˩˦'),
226
+ ('ˋ', '˥˩'),
227
+ ('˙', ''),
228
+ (',', ','),
229
+ ('。', '.'),
230
+ ('!', '!'),
231
+ ('?', '?'),
232
+ ('—', '-')
233
+ ]]
234
+
235
+
236
+ def number_to_chinese(text):
237
+ numbers = re.findall(r'\d+(?:\.?\d+)?', text)
238
+ for number in numbers:
239
+ text = text.replace(number, cn2an.an2cn(number), 1)
240
+ return text
241
+
242
+
243
+ def chinese_to_bopomofo(text):
244
+ text = text.replace('、', ',').replace(';', ',').replace(':', ',')
245
+ words = jieba.lcut(text, cut_all=False)
246
+ text = ''
247
+ for word in words:
248
+ bopomofos = lazy_pinyin(word, BOPOMOFO)
249
+ if not re.search('[\u4e00-\u9fff]', word):
250
+ text += word
251
+ continue
252
+ for i in range(len(bopomofos)):
253
+ bopomofos[i] = re.sub(r'([\u3105-\u3129])$', r'\1ˉ', bopomofos[i])
254
+ if text != '':
255
+ text += ' '
256
+ text += ''.join(bopomofos)
257
+ return text
258
+
259
+
260
+ def latin_to_bopomofo(text):
261
+ for regex, replacement in _latin_to_bopomofo:
262
+ text = re.sub(regex, replacement, text)
263
+ return text
264
+
265
+
266
+ def bopomofo_to_romaji(text):
267
+ for regex, replacement in _bopomofo_to_romaji:
268
+ text = re.sub(regex, replacement, text)
269
+ return text
270
+
271
+
272
+ def bopomofo_to_ipa(text):
273
+ for regex, replacement in _bopomofo_to_ipa:
274
+ text = re.sub(regex, replacement, text)
275
+ return text
276
+
277
+
278
+ def bopomofo_to_ipa2(text):
279
+ for regex, replacement in _bopomofo_to_ipa2:
280
+ text = re.sub(regex, replacement, text)
281
+ return text
282
+
283
+
284
+ def chinese_to_romaji(text):
285
+ text = number_to_chinese(text)
286
+ text = chinese_to_bopomofo(text)
287
+ text = latin_to_bopomofo(text)
288
+ text = bopomofo_to_romaji(text)
289
+ text = re.sub('i([aoe])', r'y\1', text)
290
+ text = re.sub('u([aoəe])', r'w\1', text)
291
+ text = re.sub('([ʦsɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
292
+ r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
293
+ text = re.sub('([ʦs][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
294
+ return text
295
+
296
+
297
+ def chinese_to_lazy_ipa(text):
298
+ text = chinese_to_romaji(text)
299
+ for regex, replacement in _romaji_to_ipa:
300
+ text = re.sub(regex, replacement, text)
301
+ return text
302
+
303
+
304
+ def chinese_to_ipa(text):
305
+ text = number_to_chinese(text)
306
+ text = chinese_to_bopomofo(text)
307
+ text = latin_to_bopomofo(text)
308
+ text = bopomofo_to_ipa(text)
309
+ text = re.sub('i([aoe])', r'j\1', text)
310
+ text = re.sub('u([aoəe])', r'w\1', text)
311
+ text = re.sub('([sɹ]`[⁼ʰ]?)([→↓↑ ]+|$)',
312
+ r'\1ɹ`\2', text).replace('ɻ', 'ɹ`')
313
+ text = re.sub('([s][⁼ʰ]?)([→↓↑ ]+|$)', r'\1ɹ\2', text)
314
+ return text
315
+
316
+
317
+ def chinese_to_ipa2(text):
318
+ text = number_to_chinese(text)
319
+ text = chinese_to_bopomofo(text)
320
+ text = latin_to_bopomofo(text)
321
+ text = bopomofo_to_ipa2(text)
322
+ text = re.sub(r'i([aoe])', r'j\1', text)
323
+ text = re.sub(r'u([aoəe])', r'w\1', text)
324
+ text = re.sub(r'([ʂɹ]ʰ?)([˩˨˧˦˥ ]+|$)', r'\1ʅ\2', text)
325
+ text = re.sub(r'(sʰ?)([˩˨˧˦˥ ]+|$)', r'\1ɿ\2', text)
326
+ return text
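
The Mandarin front end above chains number expansion, jieba segmentation, bopomofo and IPA. A small illustration; it assumes pypinyin, jieba and cn2an are installed, and the printed output is indicative only.

from openvoice.text.mandarin import number_to_chinese, chinese_to_bopomofo, chinese_to_ipa

text = '今天是2024年'
step1 = number_to_chinese(text)     # digits rewritten as Chinese numerals via cn2an
step2 = chinese_to_bopomofo(step1)  # jieba word segmentation, then bopomofo with tone marks
print(step1)
print(step2)
print(chinese_to_ipa(text))         # full pipeline: numbers -> bopomofo -> IPA with tone arrows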
dreamvoice/train_utils/src/openvoice/text/symbols.py ADDED
@@ -0,0 +1,88 @@
1
+ '''
2
+ Defines the set of symbols used in text input to the model.
3
+ '''
4
+
5
+ # japanese_cleaners
6
+ # _pad = '_'
7
+ # _punctuation = ',.!?-'
8
+ # _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧ↓↑ '
9
+
10
+
11
+ '''# japanese_cleaners2
12
+ _pad = '_'
13
+ _punctuation = ',.!?-~…'
14
+ _letters = 'AEINOQUabdefghijkmnoprstuvwyzʃʧʦ↓↑ '
15
+ '''
16
+
17
+
18
+ '''# korean_cleaners
19
+ _pad = '_'
20
+ _punctuation = ',.!?…~'
21
+ _letters = 'ㄱㄴㄷㄹㅁㅂㅅㅇㅈㅊㅋㅌㅍㅎㄲㄸㅃㅆㅉㅏㅓㅗㅜㅡㅣㅐㅔ '
22
+ '''
23
+
24
+ '''# chinese_cleaners
25
+ _pad = '_'
26
+ _punctuation = ',。!?—…'
27
+ _letters = 'ㄅㄆㄇㄈㄉㄊㄋㄌㄍㄎㄏㄐㄑㄒㄓㄔㄕㄖㄗㄘㄙㄚㄛㄜㄝㄞㄟㄠㄡㄢㄣㄤㄥㄦㄧㄨㄩˉˊˇˋ˙ '
28
+ '''
29
+
30
+ # # zh_ja_mixture_cleaners
31
+ # _pad = '_'
32
+ # _punctuation = ',.!?-~…'
33
+ # _letters = 'AEINOQUabdefghijklmnoprstuvwyzʃʧʦɯɹəɥ⁼ʰ`→↓↑ '
34
+
35
+
36
+ '''# sanskrit_cleaners
37
+ _pad = '_'
38
+ _punctuation = '।'
39
+ _letters = 'ँंःअआइईउऊऋएऐओऔकखगघङचछजझञटठडढणतथदधनपफबभमयरलळवशषसहऽािीुूृॄेैोौ्ॠॢ '
40
+ '''
41
+
42
+ '''# cjks_cleaners
43
+ _pad = '_'
44
+ _punctuation = ',.!?-~…'
45
+ _letters = 'NQabdefghijklmnopstuvwxyzʃʧʥʦɯɹəɥçɸɾβŋɦː⁼ʰ`^#*=→↓↑ '
46
+ '''
47
+
48
+ '''# thai_cleaners
49
+ _pad = '_'
50
+ _punctuation = '.!? '
51
+ _letters = 'กขฃคฆงจฉชซฌญฎฏฐฑฒณดตถทธนบปผฝพฟภมยรฤลวศษสหฬอฮฯะัาำิีึืุูเแโใไๅๆ็่้๊๋์'
52
+ '''
53
+
54
+ # # cjke_cleaners2
55
+ _pad = '_'
56
+ _punctuation = ',.!?-~…'
57
+ _letters = 'NQabdefghijklmnopstuvwxyzɑæʃʑçɯɪɔɛɹðəɫɥɸʊɾʒθβŋɦ⁼ʰ`^#*=ˈˌ→↓↑ '
58
+
59
+
60
+ '''# shanghainese_cleaners
61
+ _pad = '_'
62
+ _punctuation = ',.!?…'
63
+ _letters = 'abdfghiklmnopstuvyzøŋȵɑɔɕəɤɦɪɿʑʔʰ̩̃ᴀᴇ15678 '
64
+ '''
65
+
66
+ '''# chinese_dialect_cleaners
67
+ _pad = '_'
68
+ _punctuation = ',.!?~…─'
69
+ _letters = '#Nabdefghijklmnoprstuvwxyzæçøŋœȵɐɑɒɓɔɕɗɘəɚɛɜɣɤɦɪɭɯɵɷɸɻɾɿʂʅʊʋʌʏʑʔʦʮʰʷˀː˥˦˧˨˩̥̩̃̚ᴀᴇ↑↓∅ⱼ '
70
+ '''
71
+
72
+ # Export all symbols:
73
+ symbols = [_pad] + list(_punctuation) + list(_letters)
74
+
75
+ # Special symbol ids
76
+ SPACE_ID = symbols.index(" ")
77
+
78
+ num_ja_tones = 1
79
+ num_kr_tones = 1
80
+ num_zh_tones = 6
81
+ num_en_tones = 4
82
+
83
+ language_tone_start_map = {
84
+ "ZH": 0,
85
+ "JP": num_zh_tones,
86
+ "EN": num_zh_tones + num_ja_tones,
87
+ 'KR': num_zh_tones + num_ja_tones + num_en_tones,
88
+ }
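
The tone offsets above let a single tone-embedding table serve every language; this is the shift applied by cleaned_text_to_sequence_vits2 in text/__init__.py. A two-line illustration:

from openvoice.text.symbols import symbols, language_tone_start_map

print(len(symbols))                                             # size of the shared symbol inventory
print(language_tone_start_map['EN'])                            # 7 = num_zh_tones + num_ja_tones
print([t + language_tone_start_map['EN'] for t in [0, 1, 2]])   # English tones 0-2 become 7-9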
dreamvoice/train_utils/src/openvoice/transforms.py ADDED
@@ -0,0 +1,209 @@
1
+ import torch
2
+ from torch.nn import functional as F
3
+
4
+ import numpy as np
5
+
6
+
7
+ DEFAULT_MIN_BIN_WIDTH = 1e-3
8
+ DEFAULT_MIN_BIN_HEIGHT = 1e-3
9
+ DEFAULT_MIN_DERIVATIVE = 1e-3
10
+
11
+
12
+ def piecewise_rational_quadratic_transform(
13
+ inputs,
14
+ unnormalized_widths,
15
+ unnormalized_heights,
16
+ unnormalized_derivatives,
17
+ inverse=False,
18
+ tails=None,
19
+ tail_bound=1.0,
20
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
21
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
22
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
23
+ ):
24
+ if tails is None:
25
+ spline_fn = rational_quadratic_spline
26
+ spline_kwargs = {}
27
+ else:
28
+ spline_fn = unconstrained_rational_quadratic_spline
29
+ spline_kwargs = {"tails": tails, "tail_bound": tail_bound}
30
+
31
+ outputs, logabsdet = spline_fn(
32
+ inputs=inputs,
33
+ unnormalized_widths=unnormalized_widths,
34
+ unnormalized_heights=unnormalized_heights,
35
+ unnormalized_derivatives=unnormalized_derivatives,
36
+ inverse=inverse,
37
+ min_bin_width=min_bin_width,
38
+ min_bin_height=min_bin_height,
39
+ min_derivative=min_derivative,
40
+ **spline_kwargs
41
+ )
42
+ return outputs, logabsdet
43
+
44
+
45
+ def searchsorted(bin_locations, inputs, eps=1e-6):
46
+ bin_locations[..., -1] += eps
47
+ return torch.sum(inputs[..., None] >= bin_locations, dim=-1) - 1
48
+
49
+
50
+ def unconstrained_rational_quadratic_spline(
51
+ inputs,
52
+ unnormalized_widths,
53
+ unnormalized_heights,
54
+ unnormalized_derivatives,
55
+ inverse=False,
56
+ tails="linear",
57
+ tail_bound=1.0,
58
+ min_bin_width=DEFAULT_MIN_BIN_WIDTH,
59
+ min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
60
+ min_derivative=DEFAULT_MIN_DERIVATIVE,
61
+ ):
62
+ inside_interval_mask = (inputs >= -tail_bound) & (inputs <= tail_bound)
63
+ outside_interval_mask = ~inside_interval_mask
64
+
65
+ outputs = torch.zeros_like(inputs)
66
+ logabsdet = torch.zeros_like(inputs)
67
+
68
+ if tails == "linear":
69
+ unnormalized_derivatives = F.pad(unnormalized_derivatives, pad=(1, 1))
70
+ constant = np.log(np.exp(1 - min_derivative) - 1)
71
+ unnormalized_derivatives[..., 0] = constant
72
+ unnormalized_derivatives[..., -1] = constant
73
+
74
+ outputs[outside_interval_mask] = inputs[outside_interval_mask]
75
+ logabsdet[outside_interval_mask] = 0
76
+ else:
77
+ raise RuntimeError("{} tails are not implemented.".format(tails))
78
+
79
+ (
80
+ outputs[inside_interval_mask],
81
+ logabsdet[inside_interval_mask],
82
+ ) = rational_quadratic_spline(
83
+ inputs=inputs[inside_interval_mask],
84
+ unnormalized_widths=unnormalized_widths[inside_interval_mask, :],
85
+ unnormalized_heights=unnormalized_heights[inside_interval_mask, :],
86
+ unnormalized_derivatives=unnormalized_derivatives[inside_interval_mask, :],
87
+ inverse=inverse,
88
+ left=-tail_bound,
89
+ right=tail_bound,
90
+ bottom=-tail_bound,
91
+ top=tail_bound,
92
+ min_bin_width=min_bin_width,
93
+ min_bin_height=min_bin_height,
94
+ min_derivative=min_derivative,
95
+ )
96
+
97
+ return outputs, logabsdet
98
+
99
+
100
+ def rational_quadratic_spline(
101
+ inputs,
102
+ unnormalized_widths,
103
+ unnormalized_heights,
104
+ unnormalized_derivatives,
105
+ inverse=False,
106
+ left=0.0,
107
+ right=1.0,
108
+ bottom=0.0,
109
+ top=1.0,
+     min_bin_width=DEFAULT_MIN_BIN_WIDTH,
+     min_bin_height=DEFAULT_MIN_BIN_HEIGHT,
+     min_derivative=DEFAULT_MIN_DERIVATIVE,
+ ):
+     if torch.min(inputs) < left or torch.max(inputs) > right:
+         raise ValueError("Input to a transform is not within its domain")
+
+     num_bins = unnormalized_widths.shape[-1]
+
+     if min_bin_width * num_bins > 1.0:
+         raise ValueError("Minimal bin width too large for the number of bins")
+     if min_bin_height * num_bins > 1.0:
+         raise ValueError("Minimal bin height too large for the number of bins")
+
+     widths = F.softmax(unnormalized_widths, dim=-1)
+     widths = min_bin_width + (1 - min_bin_width * num_bins) * widths
+     cumwidths = torch.cumsum(widths, dim=-1)
+     cumwidths = F.pad(cumwidths, pad=(1, 0), mode="constant", value=0.0)
+     cumwidths = (right - left) * cumwidths + left
+     cumwidths[..., 0] = left
+     cumwidths[..., -1] = right
+     widths = cumwidths[..., 1:] - cumwidths[..., :-1]
+
+     derivatives = min_derivative + F.softplus(unnormalized_derivatives)
+
+     heights = F.softmax(unnormalized_heights, dim=-1)
+     heights = min_bin_height + (1 - min_bin_height * num_bins) * heights
+     cumheights = torch.cumsum(heights, dim=-1)
+     cumheights = F.pad(cumheights, pad=(1, 0), mode="constant", value=0.0)
+     cumheights = (top - bottom) * cumheights + bottom
+     cumheights[..., 0] = bottom
+     cumheights[..., -1] = top
+     heights = cumheights[..., 1:] - cumheights[..., :-1]
+
+     if inverse:
+         bin_idx = searchsorted(cumheights, inputs)[..., None]
+     else:
+         bin_idx = searchsorted(cumwidths, inputs)[..., None]
+
+     input_cumwidths = cumwidths.gather(-1, bin_idx)[..., 0]
+     input_bin_widths = widths.gather(-1, bin_idx)[..., 0]
+
+     input_cumheights = cumheights.gather(-1, bin_idx)[..., 0]
+     delta = heights / widths
+     input_delta = delta.gather(-1, bin_idx)[..., 0]
+
+     input_derivatives = derivatives.gather(-1, bin_idx)[..., 0]
+     input_derivatives_plus_one = derivatives[..., 1:].gather(-1, bin_idx)[..., 0]
+
+     input_heights = heights.gather(-1, bin_idx)[..., 0]
+
+     if inverse:
+         a = (inputs - input_cumheights) * (
+             input_derivatives + input_derivatives_plus_one - 2 * input_delta
+         ) + input_heights * (input_delta - input_derivatives)
+         b = input_heights * input_derivatives - (inputs - input_cumheights) * (
+             input_derivatives + input_derivatives_plus_one - 2 * input_delta
+         )
+         c = -input_delta * (inputs - input_cumheights)
+
+         discriminant = b.pow(2) - 4 * a * c
+         assert (discriminant >= 0).all()
+
+         root = (2 * c) / (-b - torch.sqrt(discriminant))
+         outputs = root * input_bin_widths + input_cumwidths
+
+         theta_one_minus_theta = root * (1 - root)
+         denominator = input_delta + (
+             (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+             * theta_one_minus_theta
+         )
+         derivative_numerator = input_delta.pow(2) * (
+             input_derivatives_plus_one * root.pow(2)
+             + 2 * input_delta * theta_one_minus_theta
+             + input_derivatives * (1 - root).pow(2)
+         )
+         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+         return outputs, -logabsdet
+     else:
+         theta = (inputs - input_cumwidths) / input_bin_widths
+         theta_one_minus_theta = theta * (1 - theta)
+
+         numerator = input_heights * (
+             input_delta * theta.pow(2) + input_derivatives * theta_one_minus_theta
+         )
+         denominator = input_delta + (
+             (input_derivatives + input_derivatives_plus_one - 2 * input_delta)
+             * theta_one_minus_theta
+         )
+         outputs = input_cumheights + numerator / denominator
+
+         derivative_numerator = input_delta.pow(2) * (
+             input_derivatives_plus_one * theta.pow(2)
+             + 2 * input_delta * theta_one_minus_theta
+             + input_derivatives * (1 - theta).pow(2)
+         )
+         logabsdet = torch.log(derivative_numerator) - 2 * torch.log(denominator)
+
+         return outputs, logabsdet
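
The forward branch above evaluates the monotonic rational-quadratic spline of Durkan et al., "Neural Spline Flows" (2019). As a reference for the algebra, the following notation is introduced here rather than taken from the code: x_k and y_k are the knot coordinates stored in cumwidths and cumheights, \delta_k the knot derivatives, s_k the bin slope input_delta, and \theta the relative position inside the bin.

\theta = \frac{x - x_k}{x_{k+1} - x_k}, \qquad s_k = \frac{y_{k+1} - y_k}{x_{k+1} - x_k}

g(x) = y_k + \frac{(y_{k+1} - y_k)\,\big[ s_k \theta^2 + \delta_k\,\theta(1-\theta) \big]}{s_k + (\delta_{k+1} + \delta_k - 2 s_k)\,\theta(1-\theta)}

\log |g'(x)| = \log\!\big( s_k^2 \big[ \delta_{k+1}\theta^2 + 2 s_k\,\theta(1-\theta) + \delta_k (1-\theta)^2 \big] \big) - 2 \log\!\big( s_k + (\delta_{k+1} + \delta_k - 2 s_k)\,\theta(1-\theta) \big)

The inverse branch solves the corresponding quadratic a\theta^2 + b\theta + c = 0 for \theta (the variable named root), takes the numerically stable root 2c / (-b - \sqrt{b^2 - 4ac}), and returns -\log|g'| as the Jacobian term.
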
dreamvoice/train_utils/src/openvoice/utils.py ADDED
@@ -0,0 +1,194 @@
+ import re
+ import json
+ import numpy as np
+
+
+ def get_hparams_from_file(config_path):
+     with open(config_path, "r", encoding="utf-8") as f:
+         data = f.read()
+     config = json.loads(data)
+
+     hparams = HParams(**config)
+     return hparams
+
+ class HParams:
+     def __init__(self, **kwargs):
+         for k, v in kwargs.items():
+             if type(v) == dict:
+                 v = HParams(**v)
+             self[k] = v
+
+     def keys(self):
+         return self.__dict__.keys()
+
+     def items(self):
+         return self.__dict__.items()
+
+     def values(self):
+         return self.__dict__.values()
+
+     def __len__(self):
+         return len(self.__dict__)
+
+     def __getitem__(self, key):
+         return getattr(self, key)
+
+     def __setitem__(self, key, value):
+         return setattr(self, key, value)
+
+     def __contains__(self, key):
+         return key in self.__dict__
+
+     def __repr__(self):
+         return self.__dict__.__repr__()
+
+
+ def string_to_bits(string, pad_len=8):
+     # Convert each character to its ASCII value
+     ascii_values = [ord(char) for char in string]
+
+     # Convert ASCII values to binary representation
+     binary_values = [bin(value)[2:].zfill(8) for value in ascii_values]
+
+     # Convert binary strings to integer arrays
+     bit_arrays = [[int(bit) for bit in binary] for binary in binary_values]
+
+     # Convert list of arrays to NumPy array
+     numpy_array = np.array(bit_arrays)
+     numpy_array_full = np.zeros((pad_len, 8), dtype=numpy_array.dtype)
+     numpy_array_full[:, 2] = 1
+     max_len = min(pad_len, len(numpy_array))
+     numpy_array_full[:max_len] = numpy_array[:max_len]
+     return numpy_array_full
+
+
+ def bits_to_string(bits_array):
+     # Convert each row of the array to a binary string
+     binary_values = [''.join(str(bit) for bit in row) for row in bits_array]
+
+     # Convert binary strings to ASCII values
+     ascii_values = [int(binary, 2) for binary in binary_values]
+
+     # Convert ASCII values to characters
+     output_string = ''.join(chr(value) for value in ascii_values)
+
+     return output_string
+
+
+ def split_sentence(text, min_len=10, language_str='[EN]'):
+     if language_str in ['EN']:
+         sentences = split_sentences_latin(text, min_len=min_len)
+     else:
+         sentences = split_sentences_zh(text, min_len=min_len)
+     return sentences
+
+ def split_sentences_latin(text, min_len=10):
+     """Split long sentences into a list of short ones.
+
+     Args:
+         str: Input sentences.
+
+     Returns:
+         List[str]: list of output sentences.
+     """
+     # deal with dirty sentences
+     text = re.sub('[。!?;]', '.', text)
+     text = re.sub('[,]', ',', text)
+     text = re.sub('[“”]', '"', text)
+     text = re.sub('[‘’]', "'", text)
+     text = re.sub(r"[\<\>\(\)\[\]\"\«\»]+", "", text)
+     text = re.sub('[\n\t ]+', ' ', text)
+     text = re.sub('([,.!?;])', r'\1 $#!', text)
+     # split
+     sentences = [s.strip() for s in text.split('$#!')]
+     if len(sentences[-1]) == 0: del sentences[-1]
+
+     new_sentences = []
+     new_sent = []
+     count_len = 0
+     for ind, sent in enumerate(sentences):
+         # print(sent)
+         new_sent.append(sent)
+         count_len += len(sent.split(" "))
+         if count_len > min_len or ind == len(sentences) - 1:
+             count_len = 0
+             new_sentences.append(' '.join(new_sent))
+             new_sent = []
+     return merge_short_sentences_latin(new_sentences)
+
+
+ def merge_short_sentences_latin(sens):
+     """Avoid short sentences by merging them with the following sentence.
+
+     Args:
+         List[str]: list of input sentences.
+
+     Returns:
+         List[str]: list of output sentences.
+     """
+     sens_out = []
+     for s in sens:
+         # If the previous sentence is too short, merge them with
+         # the current sentence.
+         if len(sens_out) > 0 and len(sens_out[-1].split(" ")) <= 2:
+             sens_out[-1] = sens_out[-1] + " " + s
+         else:
+             sens_out.append(s)
+     try:
+         if len(sens_out[-1].split(" ")) <= 2:
+             sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+             sens_out.pop(-1)
+     except:
+         pass
+     return sens_out
+
+ def split_sentences_zh(text, min_len=10):
+     text = re.sub('[。!?;]', '.', text)
+     text = re.sub('[,]', ',', text)
+     # replace newlines, tabs and runs of spaces with a single space
+     text = re.sub('[\n\t ]+', ' ', text)
+     # add a split marker after each punctuation mark
+     text = re.sub('([,.!?;])', r'\1 $#!', text)
+     # split into sentences and strip surrounding whitespace
+     # sentences = [s.strip() for s in re.split('(。|!|?|;)', text)]
+     sentences = [s.strip() for s in text.split('$#!')]
+     if len(sentences[-1]) == 0: del sentences[-1]
+
+     new_sentences = []
+     new_sent = []
+     count_len = 0
+     for ind, sent in enumerate(sentences):
+         new_sent.append(sent)
+         count_len += len(sent)
+         if count_len > min_len or ind == len(sentences) - 1:
+             count_len = 0
+             new_sentences.append(' '.join(new_sent))
+             new_sent = []
+     return merge_short_sentences_zh(new_sentences)
+
+
+ def merge_short_sentences_zh(sens):
+     # return sens
+     """Avoid short sentences by merging them with the following sentence.
+
+     Args:
+         List[str]: list of input sentences.
+
+     Returns:
+         List[str]: list of output sentences.
+     """
+     sens_out = []
+     for s in sens:
+         # If the previous sentence is too short, merge them with
+         # the current sentence.
+         if len(sens_out) > 0 and len(sens_out[-1]) <= 2:
+             sens_out[-1] = sens_out[-1] + " " + s
+         else:
+             sens_out.append(s)
+     try:
+         if len(sens_out[-1]) <= 2:
+             sens_out[-2] = sens_out[-2] + " " + sens_out[-1]
+             sens_out.pop(-1)
+     except:
+         pass
+     return sens_out
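
A minimal usage sketch for these helpers, assuming `dreamvoice/train_utils/src` is on `sys.path` so the package imports as `openvoice.utils`; the checkpoint path and the config keys accessed below are hypothetical, and the `'EN'` marker is passed explicitly because the default `'[EN]'` does not satisfy the `language_str in ['EN']` check and falls through to the Chinese splitter.

import sys

sys.path.append('dreamvoice/train_utils/src')  # assumed repo layout

from openvoice.utils import (
    get_hparams_from_file,
    string_to_bits,
    bits_to_string,
    split_sentence,
)

# Nested JSON dicts become attribute-accessible HParams objects.
hps = get_hparams_from_file('checkpoints/base_speakers/EN/config.json')  # hypothetical path
print(hps.data.sampling_rate)  # assumes the config has a nested "data" section

# 8-bit helpers: pad_len rows of 8 bits; unused rows have bit 2 set, so they decode to spaces.
bits = string_to_bits('hi', pad_len=8)   # (8, 8) array of 0/1
print(repr(bits_to_string(bits)))        # -> 'hi      '

# Sentence chunking for the TTS front end; pass the language marker explicitly.
pieces = split_sentence(
    'Hello there. This sentence is long enough to be split into several chunks.',
    min_len=10,
    language_str='EN',
)
print(pieces)
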