import gradio as gr
import tempfile
from TTS.api import TTS
from huggingface_hub import hf_hub_download
import torch

CUDA = torch.cuda.is_available()

REPO_ID = "ayymen/Coqui-TTS-Vits-shi"

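# Coqui voice-conversion models used to clone the voice of a reference speaker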
VOICE_CONVERSION_MODELS = {
    'freevc24': 'voice_conversion_models/multilingual/vctk/freevc24',
    'openvoice_v1': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v1',
    'openvoice_v2': 'voice_conversion_models/multilingual/multi-dataset/openvoice_v2',
}

my_title = "ⴰⴹⵕⵉⵚ ⵙ ⵉⵎⵙⵍⵉ - Tamazight Text-to-Speech"
my_description = "This model is based on [VITS](https://github.com/jaywalnut310/vits), thanks to 🐸 [Coqui.ai](https://coqui.ai/)." 

my_examples = [
  ["ⴰⵣⵓⵍ. ⵎⴰⵏⵣⴰⴽⵉⵏ?"],
  ["ⵡⴰ ⵜⴰⵎⵖⴰⵔⵜ ⵎⴰ ⴷ ⵓⴽⴰⵏ ⵜⵙⴽⵔⵜ?"],
  ["ⴳⵏ ⴰⴷ ⴰⴽ ⵉⵙⵙⴳⵏ ⵕⴱⴱⵉ ⵉⵜⵜⵓ ⴽ."],
  ["ⴰⵔⵔⴰⵡ ⵏ ⵍⵀⵎⵎ ⵢⵓⴽⵔ ⴰⵖ ⵉⵀⴷⵓⵎⵏ ⵏⵏⵖ!"]
]

my_inputs = [
  gr.Textbox(lines=5, label="Input Text", placeholder="The only available characters are: ⴰⴱⴳⴷⴹⴻⴼⴽⵀⵃⵄⵅⵇⵉⵊⵍⵎⵏⵓⵔⵕⵖⵙⵚⵛⵜⵟⵡⵢⵣⵥⵯ !,.:?"),
  gr.Audio(type="filepath", label="Speaker audio for voice cloning (optional)"),
  gr.Dropdown(label="Voice Conversion Model", choices=list(VOICE_CONVERSION_MODELS.keys()), value="freevc24"),
  gr.Checkbox(label="Split Sentences (each sentence will be generated separately)", value=True)
]

my_outputs = gr.Audio(type="filepath", label="Output Audio", autoplay=True)

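# download the fine-tuned VITS checkpoint and its config from the Hugging Face Hub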
best_model_path = hf_hub_download(repo_id=REPO_ID, filename="best_model.pth") 
config_path = hf_hub_download(repo_id=REPO_ID, filename="config.json")

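# load the model through the Coqui TTS API, on GPU when available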
api = TTS(model_path=best_model_path, config_path=config_path).to("cuda" if CUDA else "cpu")

# pre-download all voice conversion models so the first request doesn't block on a download
for model in VOICE_CONVERSION_MODELS.values():
    api.load_vc_model_by_name(model, gpu=CUDA)

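# synthesize speech from text; if a reference wav is given, convert the result to that speaker's voice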
def tts(text: str, speaker_wav: str | None = None, voice_cv_model: str = 'freevc24', split_sentences: bool = True):
    # map out-of-vocabulary characters onto punctuation the model supports
    text = text.replace("\n", ". ")
    text = text.replace("(", ",")
    text = text.replace(")", ",")
    text = text.replace('"', ",")
    text = text.replace(";", ",")
    text = text.replace("-", " ")

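    # write the output to a persistent temp file so Gradio can serve it by path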
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
        if speaker_wav:
            api.load_vc_model_by_name(VOICE_CONVERSION_MODELS[voice_cv_model], gpu=CUDA)
            api.tts_with_vc_to_file(text, speaker_wav=speaker_wav, file_path=fp.name, split_sentences=split_sentences)
        else:
            api.tts_to_file(text, file_path=fp.name, split_sentences=split_sentences)

    return fp.name

iface = gr.Interface(
    fn=tts, 
    inputs=my_inputs, 
    outputs=my_outputs, 
    title=my_title, 
    description=my_description, 
    examples=my_examples,
    cache_examples=True
)
iface.launch()