konverner commited on
Commit
899cf32
·
0 Parent(s):

Initial commit

Browse files
.gitignore ADDED
@@ -0,0 +1,169 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Initially taken from Github's Python gitignore file
2
+
3
+ # Byte-compiled / optimized / DLL files
4
+ __pycache__/
5
+ *.py[cod]
6
+ *$py.class
7
+
8
+ # C extensions
9
+ *.so
10
+
11
+ # tests and logs
12
+ tests/fixtures/cached_*_text.txt
13
+ logs/
14
+ lightning_logs/
15
+ lang_code_data/
16
+
17
+ # Distribution / packaging
18
+ .Python
19
+ build/
20
+ develop-eggs/
21
+ dist/
22
+ downloads/
23
+ eggs/
24
+ .eggs/
25
+ lib/
26
+ lib64/
27
+ parts/
28
+ sdist/
29
+ var/
30
+ wheels/
31
+ *.egg-info/
32
+ .installed.cfg
33
+ *.egg
34
+ MANIFEST
35
+
36
+ # PyInstaller
37
+ # Usually these files are written by a python script from a template
38
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
39
+ *.manifest
40
+ *.spec
41
+
42
+ # Installer logs
43
+ pip-log.txt
44
+ pip-delete-this-directory.txt
45
+
46
+ # Unit test / coverage reports
47
+ htmlcov/
48
+ .tox/
49
+ .nox/
50
+ .coverage
51
+ .coverage.*
52
+ .cache
53
+ nosetests.xml
54
+ coverage.xml
55
+ *.cover
56
+ .hypothesis/
57
+ .pytest_cache/
58
+
59
+ # Translations
60
+ *.mo
61
+ *.pot
62
+
63
+ # Django stuff:
64
+ *.log
65
+ local_settings.py
66
+ db.sqlite3
67
+
68
+ # Flask stuff:
69
+ instance/
70
+ .webassets-cache
71
+
72
+ # Scrapy stuff:
73
+ .scrapy
74
+
75
+ # Sphinx documentation
76
+ docs/_build/
77
+
78
+ # PyBuilder
79
+ target/
80
+
81
+ # Jupyter Notebook
82
+ .ipynb_checkpoints
83
+
84
+ # IPython
85
+ profile_default/
86
+ ipython_config.py
87
+
88
+ # pyenv
89
+ .python-version
90
+
91
+ # celery beat schedule file
92
+ celerybeat-schedule
93
+
94
+ # SageMath parsed files
95
+ *.sage.py
96
+
97
+ # Environments
98
+ .env
99
+ .venv
100
+ env/
101
+ venv/
102
+ ENV/
103
+ env.bak/
104
+ venv.bak/
105
+
106
+ # Spyder project settings
107
+ .spyderproject
108
+ .spyproject
109
+
110
+ # Rope project settings
111
+ .ropeproject
112
+
113
+ # mkdocs documentation
114
+ /site
115
+
116
+ # mypy
117
+ .mypy_cache/
118
+ .dmypy.json
119
+ dmypy.json
120
+
121
+ # Pyre type checker
122
+ .pyre/
123
+
124
+ # vscode
125
+ .vs
126
+ .vscode
127
+
128
+ # Pycharm
129
+ .idea
130
+
131
+ # TF code
132
+ tensorflow_code
133
+
134
+ # Models
135
+ proc_data
136
+
137
+ # examples
138
+ runs
139
+ /runs_old
140
+ /wandb
141
+ /examples/runs
142
+ /examples/**/*.args
143
+ /examples/rag/sweep
144
+
145
+ # data
146
+ /data
147
+ serialization_dir
148
+
149
+ # emacs
150
+ *.*~
151
+ debug.env
152
+
153
+ # vim
154
+ .*.swp
155
+
156
+ #ctags
157
+ tags
158
+
159
+ # pre-commit
160
+ .pre-commit*
161
+
162
+ # .lock
163
+ *.lock
164
+
165
+ # DS_Store (MacOS)
166
+ .DS_Store
167
+
168
+ # ruff
169
+ .ruff_cache
LICENSE ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2023 Konstantin Verner
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Few-Shot Voice Cloning
2
+
3
+ This repository is an implementation of a pipeline for few-shot voice cloning based on the SpeechT5 architecture introduced in [SpeechT5: Unified-Modal Encoder-Decoder Pre-Training for Spoken Language Processing](https://arxiv.org/abs/2110.07205).
4
+ It is able to clone a voice from 15-30 seconds of audio recording in English (other languages are planned).
5
+
6
+ # Getting Started
7
+
8
+ Clone repository
9
+ ```angular2html
10
+ git clone https://github.com/konverner/deep-voice-cloning.git
11
+ ```
12
+
13
+ Install the modules
14
+ ```angular2html
15
+ pip install .
16
+ ```
17
+
18
+ Run training, specifying arguments either in the config file `training_config.json` or on the command line, for example
19
+ ```angular2html
20
+ python scripts/train.py --audio_path scripts/input/hank.mp3 --output_dir /content/deep-voice-cloning/models
21
+ ```
22
+ Resulting model will be saved in `output_dir` directory. It will be used in the next step.
23
+
24
+ Run inference specifying arguments using config file `inference_config.json` or the console command, for example
25
+ ```angular2html
26
+ python scripts/cloning_inference.py --model_path "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank"\
27
+ --input_text 'do the things, not because they are easy, but because they are hard'\
28
+ --output_path "scripts/output/do_the_things.wav"
29
+ ```
30
+
31
+ Resulting audio file will be saved as `output_path` file.
models/.gitkeep ADDED
File without changes
notebooks/.gitkeep ADDED
File without changes
scripts/cloning_inference.py ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import soundfile as sf
6
+
7
+ from deep_voice_cloning.cloning.model import CloningModel
8
+
9
+
10
if __name__ == "__main__":
    # Each flag mirrors a key of inference_config.json; a flag that is left
    # unset (None) keeps the value read from the file.
    cli = argparse.ArgumentParser()
    cli.add_argument("--model_path", type=str, default=None, help="Path to model directory")
    cli.add_argument("--input_text", type=str, default=None, help="Text to be synthesized")
    cli.add_argument("--output_path", type=str, default=None, help="Path to output audio file")
    args = cli.parse_args()

    # The config file is looked up next to this script, not in the CWD.
    config_file = os.path.join(os.path.dirname(__file__), "inference_config.json")
    with open(config_file) as f:
        config = json.load(f)

    # Command-line values take precedence over the file.
    for option in ("model_path", "input_text", "output_path"):
        override = getattr(args, option)
        if override is not None:
            config[option] = override

    # Synthesize the text and write the waveform at 16 kHz.
    cloning_model = CloningModel(config)
    waveform_array = cloning_model.forward(config["input_text"])

    sf.write(config['output_path'], waveform_array, samplerate=16000)
scripts/inference_config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_path": "/content/deep-voice-cloning/models/microsoft_speecht5_tts_hank_hill",
3
+ "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb",
4
+ "vocoder_name": "microsoft/speecht5_hifigan",
5
+ "input_text": "do the things, not because they are easy, but because they are hard",
6
+ "output_path": "/content/deep-voice-cloning/scripts/output/do_the_things.wav"
7
+ }
scripts/input/hank.mp3 ADDED
Binary file (526 kB). View file
 
scripts/input/homer.mp3 ADDED
Binary file (913 kB). View file
 
scripts/output/.gitkeep ADDED
File without changes
scripts/train.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import json
3
+ import os
4
+
5
+ import torch
6
+ from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
7
+
8
+ from deep_voice_cloning.cloning.model import CloningModel
9
+ from deep_voice_cloning.transcriber.model import TranscriberModel
10
+ from deep_voice_cloning.data.collator import TTSDataCollatorWithPadding
11
+ from deep_voice_cloning.data.dataset import get_cloning_dataset
12
+
13
+
14
if __name__ == "__main__":
    # Each flag mirrors a key of training_config.json; a flag that is left
    # unset (None) keeps the value read from the file.
    parser = argparse.ArgumentParser()
    parser.add_argument("--lang", type=str, default=None, help="Language of speech samples")
    parser.add_argument("--audio_path", type=str, default=None, help="Path to training audio file")
    parser.add_argument("--output_dir", type=str, default=None, help="Path to output directory for trained model")
    args = parser.parse_args()

    # The config file is looked up next to this script, not in the CWD.
    with open(os.path.join(os.path.dirname(__file__), "training_config.json")) as f:
        training_config = json.load(f)

    # Command-line values take precedence over the file.
    for key in ("lang", "audio_path", "output_dir"):
        value = getattr(args, key)
        if value is not None:
            training_config[key] = value

    transcriber_model = TranscriberModel(lang=training_config['lang'])
    cloning_model = CloningModel(lang=training_config['lang'])

    # The transcriber produces the text labels for the audio chunks.
    dataset = get_cloning_dataset(training_config['audio_path'], transcriber_model, cloning_model)
    data_collator = TTSDataCollatorWithPadding(processor=cloning_model.processor, model=cloning_model.model)

    training_args = Seq2SeqTrainingArguments(
        output_dir=training_config["output_dir"],
        per_device_train_batch_size=training_config['batch_size'],
        gradient_accumulation_steps=2,
        overwrite_output_dir=True,
        learning_rate=training_config['learning_rate'],
        warmup_steps=training_config['warmup_steps'],
        max_steps=training_config['max_steps'],
        gradient_checkpointing=True,
        # Mixed precision only when CUDA was detected; comparing the device
        # type also matches indexed devices such as "cuda:0".
        fp16=transcriber_model.device.type == "cuda",
        evaluation_strategy="steps",
        per_device_eval_batch_size=8,
        # No intermediate checkpoints; the final model is saved explicitly below.
        save_strategy="no",
        eval_steps=100,
        logging_steps=20,
        load_best_model_at_end=False,
        greater_is_better=False,
        label_names=["labels"],
    )

    trainer = Seq2SeqTrainer(
        args=training_args,
        model=cloning_model.model,
        train_dataset=dataset,
        # NOTE: evaluation runs on the training data itself; there is no
        # held-out split, so the eval loss only tracks fitting progress.
        eval_dataset=dataset,
        data_collator=data_collator,
        tokenizer=cloning_model.processor.tokenizer,
    )

    trainer.train()

    # Save as <output_dir>/<base model id with '/' -> '_'>_<audio file stem>,
    # e.g. ".../models/microsoft_speecht5_tts_hank". os.path is used so that
    # audio paths written with the platform's separator are reduced to the
    # file name instead of leaking directories into the model name.
    base_model_name = cloning_model.config['model_path'].replace('/', '_')
    voice_name = os.path.basename(training_config['audio_path']).split('.')[0]
    cloning_model.save_pretrained(
        os.path.join(training_config["output_dir"], base_model_name + '_' + voice_name)
    )
scripts/training_config.json ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "audio_path": "/content/deep-voice-cloning/scripts/input/hank_hill.mp3",
3
+ "output_dir": "/content/deep-voice-cloning/models",
4
+ "lang": "en",
5
+ "batch_size": 2,
6
+ "learning_rate": 1e-4,
7
+ "max_steps": 1500,
8
+ "warmup_steps": 250
9
+ }
setup.py ADDED
@@ -0,0 +1,106 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from pathlib import Path
2
+
3
+ from setuptools import find_packages, setup
4
+
5
+ README_TEXT = (Path(__file__).parent / "README.md").read_text(encoding="utf-8")
6
+
7
+ MAINTAINER = "Konstantin Verner"
8
+ MAINTAINER_EMAIL = "[email protected]"
9
+ REQUIRED_PKGS = ["accelerate==0.21.0",
10
+ "aiohttp==3.8.4",
11
+ "aiosignal==1.3.1",
12
+ "appdirs==1.4.4",
13
+ "async-timeout==4.0.2",
14
+ "attrs==23.1.0",
15
+ "audioread==3.0.0",
16
+ "certifi==2023.5.7",
17
+ "cffi==1.15.1",
18
+ "charset-normalizer==3.2.0",
19
+ "colorama==0.4.6",
20
+ "datasets==2.13.1",
21
+ "decorator>=4.0.2",
22
+ "dill==0.3.6",
23
+ "filelock==3.12.2",
24
+ "frozenlist==1.4.0",
25
+ "fsspec==2023.6.0",
26
+ "huggingface-hub==0.16.4",
27
+ "HyperPyYAML==1.2.1",
28
+ "idna==3.4",
29
+ "Jinja2==3.1.2",
30
+ "joblib==1.3.1",
31
+ "lazy_loader==0.3",
32
+ "librosa==0.10.0.post2",
33
+ "llvmlite==0.40.1",
34
+ "MarkupSafe==2.1.3",
35
+ "mpmath==1.3.0",
36
+ "msgpack==1.0.5",
37
+ "multidict==6.0.4",
38
+ "multiprocess==0.70.14",
39
+ "networkx==3.1",
40
+ "numba==0.57.1",
41
+ "numpy>=1.22",
42
+ "packaging==23.1",
43
+ "pandas>=1.5.3",
44
+ "pooch==1.6.0",
45
+ "psutil==5.9.5",
46
+ "pyarrow>=3.0.0",
47
+ "pycparser==2.21",
48
+ "python-dateutil==2.8.2",
49
+ "pytz==2023.3",
50
+ "PyYAML==6.0",
51
+ "ruamel.yaml==0.17.28",
52
+ "ruamel.yaml.clib==0.2.7",
53
+ "safetensors==0.3.1",
54
+ "scikit-learn==1.3.0",
55
+ "scipy==1.11.1",
56
+ "sentencepiece==0.1.99",
57
+ "six==1.16.0",
58
+ "soundfile==0.12.1",
59
+ "soxr==0.3.5",
60
+ "speechbrain==0.5.14",
61
+ "sympy==1.12",
62
+ "threadpoolctl==3.2.0",
63
+ "tokenizers==0.13.3",
64
+ "torch==2.0.1",
65
+ "torchaudio==2.0.2",
66
+ "tqdm==4.65.0",
67
+ "transformers==4.30.2",
68
+ "typing_extensions==4.7.1",
69
+ "tzdata==2023.3",
70
+ "urllib3==2.0.3",
71
+ "xxhash==3.2.0",
72
+ "yarl==1.9.2"]
73
+
74
# Package metadata. Sources live under src/ (src-layout), so both
# package_dir and find_packages point there. The JSON model configs shipped
# inside the packages are included via package_data.
setup(
    name="deep_voice_cloning",
    version="0.1.0",
    description="Few-Shot Voice Cloning",
    long_description=README_TEXT,
    long_description_content_type="text/markdown",
    maintainer=MAINTAINER,
    maintainer_email=MAINTAINER_EMAIL,
    url="",
    download_url="",
    license="MIT",
    package_dir={"": "src"},
    packages=find_packages("src"),
    include_package_data=True,
    package_data={"": ["*.json"]},
    install_requires=REQUIRED_PKGS,
    classifiers=[
        "Development Status :: 1 - Planning",
        "Intended Audience :: Developers",
        "Intended Audience :: Education",
        "Intended Audience :: Science/Research",
        # The registered trove classifier is "MIT License", not "MIT".
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.8",
        "Programming Language :: Python :: 3.9",
        "Topic :: Scientific/Engineering :: Artificial Intelligence",
    ],
    keywords="asr, machine learning, fewshot learning, transformers",
    zip_safe=False,  # Required for mypy to find the py.typed file
)
src/deep_voice_cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/__init__.py ADDED
File without changes
src/deep_voice_cloning/cloning/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "en": {
3
+ "model_path": "microsoft/speecht5_tts",
4
+ "vocoder_name": "microsoft/speecht5_hifigan",
5
+ "speaker_model_name": "speechbrain/spkrec-xvect-voxceleb"
6
+ }
7
+ }
src/deep_voice_cloning/cloning/model.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ from typing import Dict
4
+
5
+ import numpy as np
6
+ import torch
7
+ from speechbrain.pretrained import EncoderClassifier
8
+ from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
9
+
10
+
11
+ class CloningModel:
12
+ def __init__(self, config: Dict[str, Dict[str, str]] = None, lang: str = 'en'):
13
+ super(CloningModel, self).__init__()
14
+ if config is None:
15
+ self.speaker_embedding = None
16
+ with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
17
+ self.config = json.load(f)[lang]
18
+ else:
19
+ self.config = config
20
+ self.speaker_embedding = torch.load(self.config['model_path'] + "/speaker_embedding.pt")[0]
21
+ self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
22
+ self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
23
+ self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
24
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
25
+ self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
26
+ self.to(self.device)
27
+
28
+ def to(self, device: torch.device):
29
+ self.model = self.model.to(device)
30
+ self.vocoder = self.vocoder.to(device)
31
+
32
+ def save_pretrained(self, save_directory: str):
33
+ self.model.save_pretrained(save_directory)
34
+ self.processor.save_pretrained(save_directory)
35
+ torch.save(self.speaker_embedding, save_directory + "/speaker_embedding.pt")
36
+
37
+ def forward(self, text: str) -> np.array:
38
+ # tokenize text
39
+ inputs = self.processor(text=text, return_tensors="pt")
40
+ # generate spectrogram using backbone model
41
+ spectrogram = self.model.generate_speech(inputs["input_ids"].to(self.device),
42
+ self.speaker_embedding.to(self.device))
43
+ # decode spectrogram into waveform using vocoder
44
+ with torch.no_grad():
45
+ waveform_array = self.vocoder(spectrogram).detach().cpu().numpy()
46
+ return waveform_array
47
+
48
+ def create_speaker_embedding(self, waveform: torch.tensor) -> torch.tensor:
49
+ with torch.no_grad():
50
+ speaker_embeddings = self.speaker_model.encode_batch(waveform)
51
+ speaker_embeddings = torch.nn.functional.normalize(speaker_embeddings, dim=2)
52
+ self.speaker_embedding = speaker_embeddings
53
+ speaker_embeddings = speaker_embeddings.squeeze()
54
+ return speaker_embeddings
src/deep_voice_cloning/data/__init__.py ADDED
File without changes
src/deep_voice_cloning/data/collator.py ADDED
@@ -0,0 +1,45 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from typing import Any, Dict, List, Union
3
+
4
+
5
class TTSDataCollatorWithPadding:
    """Collate per-example TTS features into one padded training batch."""

    def __init__(self, model, processor):
        # model: only ``model.config.reduction_factor`` is read.
        # processor: only ``processor.pad`` is called.
        self.model = model
        self.processor = processor

    def __call__(self, features: List[Dict[str, Union[List[int], torch.Tensor]]]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples and return the batch.

        Each example must provide ``input_ids``, ``labels`` and
        ``speaker_embeddings``.
        """
        token_inputs = []
        label_features = []
        speaker_features = []
        for example in features:
            token_inputs.append({"input_ids": example["input_ids"]})
            label_features.append({"input_values": example["labels"]})
            speaker_features.append(example["speaker_embeddings"])

        # Pad token ids and target frames together.
        batch = self.processor.pad(
            input_ids=token_inputs,
            labels=label_features,
            return_tensors="pt",
        )

        # Padded target frames become -100 so that the loss ignores them.
        is_padding = batch.decoder_attention_mask.unsqueeze(-1).ne(1)
        batch["labels"] = batch["labels"].masked_fill(is_padding, -100)

        # The mask is not needed during fine-tuning.
        del batch["decoder_attention_mask"]

        # Truncate the targets so that the longest one is a multiple of the
        # model's reduction factor.
        reduction = self.model.config.reduction_factor
        if reduction > 1:
            frame_counts = [len(item["input_values"]) for item in label_features]
            longest = max(count - count % reduction for count in frame_counts)
            batch["labels"] = batch["labels"][:, :longest]

        # Attach the per-example speaker embeddings.
        batch["speaker_embeddings"] = torch.tensor(speaker_features)

        return batch
src/deep_voice_cloning/data/dataset.py ADDED
@@ -0,0 +1,63 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+
3
+ import torch
4
+ import librosa
5
+ import numpy as np
6
+ from datasets import Dataset
7
+
8
+ from ..cloning.model import CloningModel
9
+ from ..transcriber.model import TranscriberModel
10
+
11
+
12
def prepare_dataset(example: Dict[str, Any], model: CloningModel, sampling_rate: int = 16000) -> Dict[str, Any]:
    """
    Prepare a single example for training.

    Args:
        example: Row with ``normalized_text`` and ``audio["array"]``.
        model: Cloning model providing the processor and the speaker encoder.
        sampling_rate: Sampling rate of ``audio["array"]``; forwarded to the
            processor so that it is not silently assumed to be 16 kHz.

    Returns:
        The processor output with ``input_ids``, ``labels`` and an added
        ``speaker_embeddings`` entry.
    """
    # feature extraction and tokenization
    processed_example = model.processor(
        text=example["normalized_text"],
        audio_target=example["audio"]["array"],
        sampling_rate=sampling_rate,
        return_attention_mask=False,
    )

    # strip off the batch dimension (present when the text was given as a
    # list rather than a single string)
    if torch.tensor(processed_example['input_ids']).dim() > 1:
        processed_example['input_ids'] = processed_example['input_ids'][0]

    processed_example["labels"] = processed_example["labels"][0]

    # use SpeechBrain to obtain x-vector; note that this also updates
    # model.speaker_embedding as a side effect
    processed_example["speaker_embeddings"] = model.create_speaker_embedding(
        torch.tensor(example["audio"]["array"])
    ).numpy()

    return processed_example


def get_cloning_dataset(input_audio_path: str,
                        transcriber_model: TranscriberModel,
                        cloning_model: CloningModel,
                        sampling_rate: int = 16000,
                        window_size_secs: int = 5) -> Dataset:
    """
    Create dataset by transcribing an audio file using a pretrained Wav2Vec2 model.

    The recording is resampled to ``sampling_rate``, cut into consecutive
    windows of ``window_size_secs`` seconds, and every window is transcribed
    and converted into training features.

    Args:
        input_audio_path: Path to the audio file with the target voice.
        transcriber_model: Speech-to-text model used to label each window.
        cloning_model: Cloning model used for feature extraction.
        sampling_rate: Rate the audio is loaded at and processed with.
        window_size_secs: Length of each window in seconds.

    Returns:
        Dataset of processed examples, one per window.
    """
    speech_array, _ = librosa.load(input_audio_path, sr=sampling_rate)

    # split the waveform into consecutive windows; the last one holds the
    # remainder and may be shorter than a full window
    # NOTE(review): a very short trailing window is still transcribed and
    # embedded - consider dropping it if it causes problems downstream.
    window_size = window_size_secs * sampling_rate
    speech_arrays = [speech_array[start:start + window_size]
                     for start in range(0, len(speech_array), window_size)]
    texts = [transcriber_model.forward(chunk, sampling_rate=sampling_rate)
             for chunk in speech_arrays]

    dataset = Dataset.from_list([
        {'audio': {'array': chunk}, 'normalized_text': text}
        for chunk, text in zip(speech_arrays, texts)
    ])

    dataset = dataset.map(
        prepare_dataset,
        # pass the rate on so feature extraction uses the rate the audio was
        # actually loaded with
        fn_kwargs={'model': cloning_model, 'sampling_rate': sampling_rate},
        remove_columns=dataset.column_names,
    )

    return dataset
src/deep_voice_cloning/transcriber/__init__.py ADDED
File without changes
src/deep_voice_cloning/transcriber/config.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "language_model_names": {
3
+ "en": "jonatasgrosman/wav2vec2-large-xlsr-53-english",
4
+ "fr": "jonatasgrosman/wav2vec2-large-xlsr-53-french",
5
+ "de": "jonatasgrosman/wav2vec2-large-xlsr-53-german"
6
+ }
7
+ }
src/deep_voice_cloning/transcriber/model.py ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+
4
+ import numpy as np
5
+ import torch
6
+ from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC
7
+
8
+
9
+ class TranscriberModel:
10
+ def __init__(self, lang: str = 'en'):
11
+ with open(os.path.join(os.path.dirname(__file__), 'config.json')) as f:
12
+ config = json.load(f)
13
+ self.processor = Wav2Vec2Processor.from_pretrained(config['language_model_names'][lang])
14
+ self.model = Wav2Vec2ForCTC.from_pretrained(config['language_model_names'][lang])
15
+ self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
16
+
17
+ def forward(self, speech_array: np.array, sampling_rate: int = 16000) -> str:
18
+ model_input = self.processor(speech_array, sampling_rate=sampling_rate, return_tensors="pt", padding=True)
19
+ with torch.no_grad():
20
+ logits = self.model(model_input.input_values, attention_mask=model_input.attention_mask).logits
21
+ predicted_ids = torch.argmax(logits, dim=-1)
22
+ return self.processor.batch_decode(predicted_ids)