konverner committed on
Commit
b8d1b27
·
1 Parent(s): c5f4f8c

gradio support updated

Browse files
README.md CHANGED
@@ -28,4 +28,42 @@ python scripts/cloning_inference.py --model_path "/content/deep-voice-cloning/mo
28
  --output_path "scripts/output/do_the_things.wav"
29
  ```
30
 
31
- Resulting audio file will be saved as `output_path` file.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  --output_path "scripts/output/do_the_things.wav"
29
  ```
30
 
31
+ The resulting audio file will be saved to the path given by `output_path`.
32
+
33
+ # Docker
34
+
35
+ To build the Docker image:
36
+
37
+ ```
38
+ docker build -t deep-voice-cloning .
39
+ ```
40
+
41
+ To pull the Docker image from Docker Hub:
42
+
43
+ ```
44
+ docker pull konverner/deep-voice-cloning:latest
45
+ ```
46
+
47
+ To run the image in a container:
48
+
49
+ ```
50
+ docker run -it --entrypoint=/bin/bash konverner/deep-voice-cloning
51
+ ```
52
+
53
+ For example, to run training inside the container:
54
+
55
+ ```
56
+ python scripts/train.py --audio_path scripts/input/hank.mp3 --output_dir models
57
+ ```
58
+
59
+ For example, to run inference inside the container:
60
+
61
+ ```
62
+ python scripts/cloning_inference.py --model_path models/microsoft_speecht5_tts_hank --input_text "do the things, not because they are easy, but because they are hard" --output_path scripts/output/do_the_things.wav
63
+ ```
64
+
65
+
66
+ # Notebook Examples
67
+
68
+ An example of using the CLI for training and inference can be found in this [notebook](https://github.com/konverner/deep-voice-cloning/blob/main/notebooks/CLI_Example.ipynb).
69
+
app.py ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from pathlib import Path
3
+
4
+ import gradio as gr
5
+
6
+
7
def greet(text, audio_file_path):
    """Clone the uploaded voice and synthesize ``text`` with it.

    Runs ``scripts/train.py`` on the uploaded recording and then
    ``scripts/cloning_inference.py`` as two child processes, pointing the
    inference step at ``models/microsoft_speecht5_tts_<audio file stem>``.

    Args:
        text: Sentence to synthesize; converted with ``str()``.
        audio_file_path: Path of the uploaded reference recording; converted
            with ``str()``.

    Returns:
        ``Path("scripts/output/audio.wav")``, the file the inference script
        is asked to write.
    """
    # Imported locally so the function does not rely on changes to the
    # module-level import block.
    import subprocess
    import sys

    text = str(text)
    audio_file_path = str(audio_file_path)
    out_path = Path("scripts/output/audio.wav")
    model_path = f"models/microsoft_speecht5_tts_{Path(audio_file_path).stem}"

    # Run the scripts with the interpreter that is running this app; fall back
    # to whatever "python" is on PATH if the interpreter path is unavailable.
    python = sys.executable or "python"

    # Both values come straight from the web form, so the commands are passed
    # as argument lists with no shell involved: quotes, spaces or `;` in the
    # text or file name can neither break the command line nor inject extra
    # commands. check=False keeps the earlier os.system() behaviour of not
    # raising when a script exits with a non-zero status.
    subprocess.run(
        [
            python, "scripts/train.py",
            "--audio_path", audio_file_path,
            "--output_dir", "models",
        ],
        check=False,
    )
    subprocess.run(
        [
            python, "scripts/cloning_inference.py",
            "--model_path", model_path,
            "--input_text", text,
            "--output_path", str(out_path),
        ],
        check=False,
    )
    return out_path
16
+
17
+
18
# Input widgets: the sentence to synthesize and the reference recording.
_text_input = gr.Textbox(
    label='What would you like the voice to say? (max. 2000 characters per request)',
)
_voice_input = gr.Audio(
    type="filepath",
    source="upload",
    label='Upload a voice to clone (max. 50mb)',
)

# Wire the widgets to `greet`; its return value is rendered as audio.
demo = gr.Interface(
    fn=greet,
    inputs=[_text_input, _voice_input],
    outputs="audio",
    title="Deep Voice Cloning Tool",
)
demo.launch()
scripts/train.py CHANGED
@@ -1,6 +1,7 @@
1
  import argparse
2
  import json
3
  import os
 
4
 
5
  import torch
6
  from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
@@ -24,9 +25,9 @@ if __name__ == "__main__":
24
  if args.lang is not None:
25
  training_config['lang'] = args.lang
26
  if args.audio_path is not None:
27
- training_config['audio_path'] = args.audio_path
28
  if args.output_dir is not None:
29
- training_config['output_dir'] = args.output_dir
30
 
31
  transcriber_model = TranscriberModel(lang=training_config['lang'])
32
  cloning_model = CloningModel(lang=training_config['lang'])
@@ -64,6 +65,6 @@ if __name__ == "__main__":
64
  )
65
 
66
  trainer.train()
67
- cloning_model.save_pretrained(training_config["output_dir"] +\
68
- '/' + cloning_model.config['model_path'].replace('/', '_') +\
69
- '_' + training_config['audio_path'].split('/')[-1].split('.')[0])
 
1
  import argparse
2
  import json
3
  import os
4
+ from pathlib import Path
5
 
6
  import torch
7
  from transformers import Seq2SeqTrainingArguments, Seq2SeqTrainer
 
25
  if args.lang is not None:
26
  training_config['lang'] = args.lang
27
  if args.audio_path is not None:
28
+ training_config['audio_path'] = Path(args.audio_path)
29
  if args.output_dir is not None:
30
+ training_config['output_dir'] = Path(args.output_dir)
31
 
32
  transcriber_model = TranscriberModel(lang=training_config['lang'])
33
  cloning_model = CloningModel(lang=training_config['lang'])
 
65
  )
66
 
67
  trainer.train()
68
# Save under "<output_dir>/<base model id with '/' -> '_'>_<audio file stem>".
# The directory name is assembled as a plain string first and only then joined
# onto the output directory: `/` binds tighter than `+`, so writing
# `Path / Path + '_' + stem` evaluates as `(Path / Path) + str` and raises
# TypeError after training has already finished.
run_name = (
    cloning_model.config['model_path'].replace('/', '_')
    + '_'
    + Path(training_config['audio_path']).stem
)
# Passed as str to match the `save_directory: str` annotation of save_pretrained.
cloning_model.save_pretrained(str(Path(training_config["output_dir"]) / run_name))
scripts/training_config.json CHANGED
@@ -4,6 +4,6 @@
4
  "lang": "en",
5
  "batch_size": 2,
6
  "learning_rate": 1e-4,
7
- "max_steps": 1500,
8
- "warmup_steps": 250
9
  }
 
4
  "lang": "en",
5
  "batch_size": 2,
6
  "learning_rate": 1e-4,
7
+ "max_steps": 15,
8
+ "warmup_steps": 2
9
  }
src/deep_voice_cloning/cloning/model.py CHANGED
@@ -1,6 +1,7 @@
1
  import os
2
  import json
3
  from typing import Dict
 
4
 
5
  import numpy as np
6
  import torch
@@ -17,7 +18,7 @@ class CloningModel:
17
  self.config = json.load(f)[lang]
18
  else:
19
  self.config = config
20
- self.speaker_embedding = torch.load(self.config['model_path'] + "/speaker_embedding.pt")[0]
21
  self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
22
  self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
23
  self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
@@ -25,6 +26,8 @@ class CloningModel:
25
  self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
26
  self.to(self.device)
27
 
 
 
28
  def to(self, device: torch.device):
29
  self.model = self.model.to(device)
30
  self.vocoder = self.vocoder.to(device)
@@ -32,7 +35,7 @@ class CloningModel:
32
  def save_pretrained(self, save_directory: str):
33
  self.model.save_pretrained(save_directory)
34
  self.processor.save_pretrained(save_directory)
35
- torch.save(self.speaker_embedding, save_directory + "/speaker_embedding.pt")
36
 
37
  def forward(self, text: str) -> np.array:
38
  # tokenize text
 
1
  import os
2
  import json
3
  from typing import Dict
4
+ from pathlib import Path
5
 
6
  import numpy as np
7
  import torch
 
18
  self.config = json.load(f)[lang]
19
  else:
20
  self.config = config
21
+ self.speaker_embedding = torch.load(Path(self.config['model_path']) / "speaker_embedding.pt")[0]
22
  self.processor = SpeechT5Processor.from_pretrained(self.config['model_path'])
23
  self.model = SpeechT5ForTextToSpeech.from_pretrained(self.config['model_path'])
24
  self.vocoder = SpeechT5HifiGan.from_pretrained(self.config['vocoder_name'])
 
26
  self.speaker_model = EncoderClassifier.from_hparams(source=self.config['speaker_model_name'])
27
  self.to(self.device)
28
 
29
+
30
+
31
  def to(self, device: torch.device):
32
  self.model = self.model.to(device)
33
  self.vocoder = self.vocoder.to(device)
 
35
  def save_pretrained(self, save_directory: str):
36
  self.model.save_pretrained(save_directory)
37
  self.processor.save_pretrained(save_directory)
38
+ torch.save(self.speaker_embedding, Path(save_directory) / "speaker_embedding.pt")
39
 
40
  def forward(self, text: str) -> np.array:
41
  # tokenize text