File size: 3,921 Bytes
f654d12
12bfd03
 
 
 
 
 
 
 
 
 
 
 
f654d12
12bfd03
 
73c1b13
 
12bfd03
 
4905c07
 
12bfd03
 
 
 
 
 
 
73c1b13
12bfd03
 
 
 
73c1b13
 
12bfd03
73c1b13
 
 
 
12bfd03
 
 
 
 
 
 
4905c07
12bfd03
4905c07
12bfd03
 
 
 
4905c07
12bfd03
4905c07
12bfd03
 
4905c07
12bfd03
4905c07
12bfd03
4905c07
12bfd03
 
4905c07
12bfd03
 
4905c07
 
12bfd03
 
4905c07
12bfd03
 
 
 
 
 
 
4905c07
 
12bfd03
4905c07
 
12bfd03
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
import spaces
import random

import gradio as gr
from css.utils import *


# Custom voice generation
def custom():
    """Build the custom voice generation UI and wire up its event handlers.

    Creates the Gradio components for recording a prompt, entering the
    prompt transcript and the text to synthesize, choosing a random seed,
    and playing back the synthesized audio, then connects the two buttons
    to their callbacks.

    NOTE(review): presumably called inside an enclosing Gradio layout
    context so the components created here get rendered -- confirm against
    the caller. Names such as ``target_sr``, ``default_data``, ``prompt_sr``,
    ``cosyvoice``, ``cosyvoice_instruct``, ``use_instruct``, ``load_wav``,
    ``postprocess``, ``set_all_random_seed``, ``example_prompt_text`` and
    ``example_tts_text`` are not defined here; they presumably come from the
    star import of ``css.utils``.

    Returns:
        None. The function is used for its side effect of creating components.
    """

    def random_seed():
        """Return a random integer seed in the inclusive range 1..100000000."""
        return random.randint(1, 100000000)

    @spaces.GPU
    def generate_audio(_recorded_audio, _prompt_input_textbox, _language_radio,
                       _synthetic_input_textbox, _seed):
        """Synthesize speech for the given text using the recorded prompt voice.

        Args:
            _recorded_audio: Value of the recording component; a file path
                (the component uses ``type='filepath'``) or ``None`` when
                nothing has been recorded.
            _prompt_input_textbox: Transcript of the recorded prompt. When it
                is empty, cross-lingual inference is used.
            _language_radio: ``'same'`` or ``'cross'``; ``'cross'`` forces
                cross-lingual inference.
            _synthetic_input_textbox: Text to synthesize.
            _seed: Value passed to ``set_all_random_seed`` before inference.

        Returns:
            A ``(target_sr, audio)`` tuple. ``audio`` is ``default_data`` when
            the synthesis text or the recording is missing, otherwise the
            flattened array obtained from the model's ``'tts_speech'`` output.
        """
        import time

        # perf_counter is monotonic, so the reported durations cannot be
        # skewed by wall-clock adjustments.
        t1 = time.perf_counter()
        print(_recorded_audio, _prompt_input_textbox, _language_radio, _synthetic_input_textbox, _seed)

        # Guard clauses: warn the user and return the placeholder audio
        # instead of letting the pipeline fail on missing inputs.
        if _synthetic_input_textbox == '':
            gr.Warning('The synthesis text is empty, did you forget to input the synthesis text?')
            return (target_sr, default_data)
        if not _recorded_audio:
            # Without a recording the component yields None, which load_wav
            # cannot open; surface a warning like the empty-text case does.
            gr.Warning('The prompt audio is empty, did you forget to record the prompt audio?')
            return (target_sr, default_data)

        set_all_random_seed(_seed)
        model = cosyvoice_instruct if use_instruct(_synthetic_input_textbox) else cosyvoice
        prompt_speech_16k = postprocess(load_wav(_recorded_audio, prompt_sr))
        t2 = time.perf_counter()

        # Zero-shot inference needs the prompt transcript; fall back to
        # cross-lingual inference when it is missing or explicitly requested.
        if _language_radio == 'cross' or _prompt_input_textbox == '':
            output = model.inference_cross_lingual(_synthetic_input_textbox, prompt_speech_16k)
        else:
            output = model.inference_zero_shot(_synthetic_input_textbox, _prompt_input_textbox, prompt_speech_16k)

        t3 = time.perf_counter()
        audio_data = postprocess(output['tts_speech']).numpy().flatten()
        t4 = time.perf_counter()
        print(f'load and preprocess time: {t2-t1}s')
        print(f'inference time: {t3-t2}s')
        print(f'postprocess time: {t4-t3}s')
        return (target_sr, audio_data)

    # Prompt section: microphone recording on the left, transcript on the right.
    with gr.Column():
        with gr.Row():
            with gr.Column(scale=1, min_width=400):
                with gr.Group():
                    recorded_audio = gr.Audio(sources=['microphone'],
                                              label="Record Audio File",
                                              type='filepath')
                    gr.Text("Please click to record and read the text on the right (Chinese or English) to complete the input",
                            max_lines=1,
                            container=False,
                            interactive=False)
            with gr.Column(scale=10):
                prompt_input_textbox = gr.Textbox(label="Input Text for Recording")
                gr.Examples(
                    label="Example Recording Texts",
                    examples=example_prompt_text,
                    inputs=[prompt_input_textbox])

    # Synthesis section: language mode and the text to be spoken.
    with gr.Column():
        language_radio = gr.Radio(choices=[('Same Language', 'same'), ('Cross Language', 'cross')],
                                  value='same',
                                  label="Input Synthesis Text")
        synthetic_input_textbox = gr.Textbox(show_label=False)
        gr.Examples(
            label="Example Texts",
            examples=example_tts_text,
            inputs=[synthetic_input_textbox])

    # Seed section: a button that fills the number field with a random seed.
    with gr.Accordion(label="Random Seed"):
        with gr.Row():
            with gr.Column(scale=1, min_width=180):
                seed_button = gr.Button(value="\U0001F3B2 Shuffle Randomly",
                                        elem_classes="full-height")
            with gr.Column(scale=10):
                seed = gr.Number(show_label=False,
                                 value=0,
                                 container=False,
                                 elem_classes="full-height")
    with gr.Column():
        generate_button = gr.Button("Generate Audio", variant="primary", size="lg")

    with gr.Column():
        output_audio = gr.Audio(label="Synthesized Audio")

    # Event wiring.
    seed_button.click(fn=random_seed, outputs=[seed])
    generate_button.click(
        fn=generate_audio,
        inputs=[recorded_audio, prompt_input_textbox, language_radio, synthetic_input_textbox, seed],
        outputs=[output_audio])