Lycoris53 committed
Commit b44d606 · 1 Parent(s): b1e9e5d

Create app.py

Files changed (1)
  1. app.py +71 -0
app.py ADDED
@@ -0,0 +1,71 @@
+ from pathlib import Path
+ import gradio as gr
+ import utils
+ from models import SynthesizerTrn
+ import torch
+ from torch import no_grad, LongTensor
+ import librosa
+ from text import text_to_sequence, _clean_text
+ import commons
+ import scipy.io.wavfile as wavf
+ import os
+ 
+ import IPython.display as ipd
+ 
+ # Fine-tuned VITS checkpoint and its matching config
+ model_path = "./OUTPUT_MODEL/G_Amitaro.pth"
+ config_path = "./OUTPUT_MODEL/config.json"
+ 
+ length = 1.0
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
+ 
+ def get_text(text, hps, is_symbol):
+     # Convert raw text (or phoneme symbols) into a tensor of symbol ids
+     text_norm = text_to_sequence(text, hps.symbols, [] if is_symbol else hps.data.text_cleaners)
+     if hps.data.add_blank:
+         text_norm = commons.intersperse(text_norm, 0)
+     text_norm = LongTensor(text_norm)
+     return text_norm
+ 
+ def get_vits_array(text):
+     # Load hyperparameters and rebuild the synthesizer for each request
+     hps = utils.get_hparams_from_file(config_path)
+     net_g = SynthesizerTrn(
+         len(hps.symbols),
+         hps.data.filter_length // 2 + 1,
+         hps.train.segment_size // hps.data.hop_length,
+         n_speakers=hps.data.n_speakers,
+         **hps.model).to(device)
+     _ = net_g.eval()
+     _ = utils.load_checkpoint(model_path, net_g, None)
+ 
+     speaker_ids = hps.speakers
+ 
+     #text = "[JA]" + text + "[JA]"
+     speaker_id = 0
+     stn_tst = get_text(text, hps, False)
+     with no_grad():
+         x_tst = stn_tst.unsqueeze(0).to(device)
+         x_tst_lengths = LongTensor([stn_tst.size(0)]).to(device)
+         sid = LongTensor([speaker_id]).to(device)
+         audio = net_g.infer(x_tst, x_tst_lengths, sid=sid, noise_scale=.667, noise_scale_w=0.6,
+                             length_scale=1.0 / length)[0][0, 0].data.cpu().float().numpy()
+     del stn_tst, x_tst, x_tst_lengths, sid
+ 
+     # gr.Audio accepts a (sampling_rate, numpy_array) tuple as its output value
+     return hps.data.sampling_rate, audio
+ 
+ app = gr.Blocks()
+ with app:
+     gr.Markdown("# VITS-TTS-Japanese-Only-Amitaro\n\n"
+                 "Sample usage of fine-tuned model - [Lycoris53/Vits-Japanese-Only-Amitaro](https://huggingface.co/Lycoris53/Vits-Japanese-Only-Amitaro) \n"
+                 "Base fine-tuning code is from [Plachtaa/VITS-fast-fine-tuning](https://github.com/Plachtaa/VITS-fast-fine-tuning)"
+                 )
+     with gr.Row():
+         with gr.Column():
+             textbox = gr.TextArea(label="Text",
+                                   placeholder="Type your sentence here (Maximum 150 words)",
+                                   value="おはようございます。何が御用でしょうか?")
+         with gr.Column():
+             audio_output = gr.Audio(label="Output Audio")
+             btn = gr.Button("Generate Voice!")
+             btn.click(get_vits_array,
+                       inputs=[textbox],
+                       outputs=[audio_output])
+ 
+ app.queue(concurrency_count=3).launch()
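As a quick sanity check outside the Gradio interface, the synthesis function can also be called directly and its output written to disk. This is a minimal sketch, assuming app.py's definitions have been imported and the OUTPUT_MODEL checkpoint and config above are present locally; the output file name amitaro_sample.wav is only illustrative.

import scipy.io.wavfile as wavf

# get_vits_array returns (sampling_rate, float32 numpy waveform),
# the same tuple format gr.Audio accepts as output.
rate, audio = get_vits_array("おはようございます。何が御用でしょうか?")
wavf.write("amitaro_sample.wav", rate, audio)  # illustrative output path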