File size: 2,818 Bytes
0f20c1d
4110922
d33f32d
 
4110922
6afc51e
3a4510f
 
4110922
0f20c1d
3a4510f
 
 
 
 
 
 
0f20c1d
4110922
 
3a4510f
4af5dfa
 
 
3a4510f
4af5dfa
4110922
0a5aead
 
6ba8d0a
0a5aead
486cbec
 
 
 
 
 
d49cd75
0a5aead
3a4510f
dd46461
3a4510f
4049f7e
0a5aead
4af5dfa
0f20c1d
1b3c5cb
486cbec
5b06c99
4c6fefc
486cbec
 
cf0681a
8af32ca
16b24b7
5b06c99
 
4af5dfa
 
c5ee403
 
 
486cbec
6ba8d0a
5b06c99
 
c5ee403
 
486cbec
4c6fefc
c5ee403
4af5dfa
 
c5ee403
4af5dfa
 
0f20c1d
 
4af5dfa
3a4510f
 
1b3c5cb
3a4510f
 
1b3c5cb
 
 
4af5dfa
 
45fac78
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
###################################### imports ######################################
import torch
from TTS.api import TTS
import gradio as gr
import os
import spaces
import yaml


###################################### utilities ######################################
def get_config():
    """Load the application configuration from the YAML file named by CONFIG_PATH.

    The path is read from the ``CONFIG_PATH`` environment variable and the
    file is parsed with ``yaml.safe_load``.

    Returns:
        Whatever ``yaml.safe_load`` produces for the file contents.

    Raises:
        KeyError: If the ``CONFIG_PATH`` environment variable is not set.
        OSError: If the file cannot be opened.
    """
    # get config path
    config_path = os.environ["CONFIG_PATH"]
    # Parse the YAML file. Decode explicitly as UTF-8 so the result does not
    # depend on the locale encoding of the machine running the app.
    with open(config_path, 'r', encoding='utf-8') as file:
        config = yaml.safe_load(file)

    return config


def init_TTS(config):
    """Create the TTS model named in the config and move it to the chosen device.

    Args:
        config: Mapping whose ``['inference']['model']`` entry is passed to
            the ``TTS`` constructor.

    Returns:
        The result of ``TTS(...).to(device)``, where device is ``"cuda"``
        when ``torch.cuda.is_available()`` is true and ``"cpu"`` otherwise.
    """
    # Pick the device first, then build the model and move it there
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    model = TTS(config['inference']['model'])
    return model.to(target_device)


@spaces.GPU
def generate_speech(voice_choice, markdown, microphone, text):
    """Synthesize ``text`` to an audio file using the selected speaker voice.

    Reads the module-level ``config`` and ``tts`` objects that the script
    entry point creates before the interface is launched.

    Args:
        voice_choice: ``"Record"`` to use ``microphone`` as the speaker
            reference; any other value uses ``config['inference']['speaker_wav']``.
        markdown: Unused. It is received only because the instructions
            Markdown component is listed among the interface inputs.
        microphone: Speaker reference passed to the model as ``speaker_wav``
            when ``voice_choice`` is ``"Record"``; empty/None when nothing
            was recorded.
        text: The text to speak.

    Returns:
        ``config['inference']['file_path']``, the path the audio was written to.

    Raises:
        gr.Error: If ``text`` is blank, or if ``"Record"`` is selected but no
            recording was provided.
    """
    # Fail early with a message the UI can display instead of letting the
    # model fail on unusable input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    if voice_choice == "Record":
        if not microphone:
            raise gr.Error(
                "No recording found. Record a voice sample or choose the predefined voice."
            )
        speaker = microphone
    else:
        speaker = config['inference']['speaker_wav']

    output_path = config['inference']['file_path']
    tts.tts_to_file(text=text,
                    file_path=output_path,
                    speaker_wav=speaker,
                    language=config['inference']['language'])
    return output_path


###################################### main ######################################
def UI(config):
    """Build the Gradio interface around ``generate_speech`` and launch it.

    Args:
        config: Accepted for symmetry with the other helpers; not read here.

    Returns:
        0, once ``launch()`` has returned.
    """
    # Static text shown by the interface
    instructions_text = """If recording, speak loud and clearly. Recommended speaking track 
                              '*printing, in the only sense with which we are at present concerned, 
                              differs from, most if not all, the arts and crafts in the exhibition.*'"""
    page_title = "Voice cloning and Synthesis with Coqui-XTTS"
    page_description = "Clone your voice and Synthesize speech using predefined target voice and language. It takes a 10-20 seconds to download the model, so wait to record until the app is *Running on Zero* to begin."

    # Input components, listed in the order generate_speech receives them
    input_components = [
        gr.Radio(
            label="Record or use a predefined voice.",
            choices=["Record", "Predefined (Nancy)"],
            value="Record",
        ),
        gr.Markdown(instructions_text),
        gr.Audio(label="Audio", sources="microphone", type="filepath", elem_id='audio'),
        gr.Textbox(label="Enter your text"),
    ]

    # Assemble the interface and start serving it
    interface = gr.Interface(
        fn=generate_speech,
        inputs=input_components,
        outputs="audio",
        title=page_title,
        description=page_description,
    )
    interface.launch()
    return 0


###################################### Execute ######################################
if __name__ == "__main__":
    # Script entry point. `config` and `tts` are assigned at module level on
    # purpose: generate_speech reads both as globals, so they must exist
    # under exactly these names before the interface starts handling requests.

    # Load the YAML configuration from the path given in CONFIG_PATH
    config = get_config()
    
    # Initialize the TTS model described by the configuration
    tts = init_TTS(config)
    
    # Build and launch the Gradio interface
    UI(config)