Create app.py
app.py
ADDED
@@ -0,0 +1,71 @@
+ from pathlib import Path
+ import utils
+ from models import SynthesizerTrn
+ import torch
+ from torch import no_grad, LongTensor
+ import librosa
+ from text import text_to_sequence, _clean_text
+ import commons
+ import scipy.io.wavfile as wavf
+ import os
+ 
+ import gradio as gr
+ 
+ model_path = "./OUTPUT_MODEL/G_Amitaro.pth"
+ config_path = "./OUTPUT_MODEL/config.json"
+ 
+ length = 1.0
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ 
+ def get_text(text, hps, is_symbol):
+     # Convert raw text (or phoneme symbols) into a tensor of symbol IDs.
+     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
+     if hps.data.add_blank:
+         # Interleave blank tokens between symbols, matching the training setup.
+         text_norm = commons.intersperse(text_norm, 0)
+     text_norm = LongTensor(text_norm)
+     return text_norm
+ 
+ def get_vits_array(text):
+     # Build the synthesizer from the config and restore the fine-tuned checkpoint.
+     hps = utils.get_hparams_from_file(config_path)
+     net_g = SynthesizerTrn(
+         len(hps.symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model).to(device)
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model_path, net_g, None)
+ 
+     speaker_ids = hps.speakers
+ 
+     #text = "[JA]" + text + "[JA]"
+     speaker_id = 0
+     stn_tst = get_text(text, hps, False)
+     with no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(device)
+         x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
+         sid = LongTensor([speaker_id]).to(device)
+         audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.6,
+                             length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
+     del stn_tst, x_tst, x_tst_lengths, sid
+ 
+     # Return (sample_rate, waveform) so gr.Audio can play the result.
+     return hps.data.sampling_rate, audio
+ 
+ app = gr.Blocks()
+ with app:
+     gr.Markdown("# VITS-TTS-Japanese-Only-Amitaro\n\n"
+                 "Sample usage of the fine-tuned model [Lycoris53/Vits-Japanese-Only-Amitaro](https://huggingface.co/Lycoris53/Vits-Japanese-Only-Amitaro)\n"
+                 "Base fine-tuning code is from [Plachtaa/VITS-fast-fine-tuning](https://github.com/Plachtaa/VITS-fast-fine-tuning)"
+                 )
+     with gr.Row():
+         with gr.Column():
+             textbox = gr.TextArea(label="Text",
+                                   placeholder="Type your sentence here (Maximum 150 words)",
+                                   value="おはようございます。何が御用でしょうか?")
+         with gr.Column():
+             audio_output = gr.Audio(label="Output Audio")
+     btn = gr.Button("Generate Voice!")
+     btn.click(get_vits_array,
+               inputs=[textbox],
+               outputs=[audio_output])
+ 
+ app.queue(concurrency_count=3).launch()
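For a quick local test without the UI, the same synthesis path can be driven directly. A minimal sketch, assuming the checkpoint and config paths above exist; note that `launch()` runs at module scope, so either run this before the launch line or factor the model code into its own module first:

# Minimal sketch: synthesize one sentence and save it to disk, bypassing Gradio.
# Assumes get_vits_array and wavf (scipy.io.wavfile) from app.py are in scope.
sampling_rate, audio = get_vits_array("おはようございます。何が御用でしょうか?")
wavf.write("sample.wav", sampling_rate, audio)  # writes the float32 waveform as a WAV file

Since the model outputs float32 in roughly the -1..1 range, wavf.write produces an IEEE-float WAV; players that expect 16-bit PCM may need the array scaled and cast to int16 first.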