File size: 4,473 Bytes
c005bf8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import gradio as gr
import asyncio
import base64
import os
from tts import voices, tts, get_task_result, Voice
import tempfile

def generate_speech(text, voice_name, custom_audio=None, custom_prompt_text=None):
    """Synthesize speech for ``text`` and return ``(audio_path, status_message)``.

    Args:
        text: Text to convert to speech. ``None``, empty, or whitespace-only
            input is rejected with a status message.
        voice_name: Key into the module-level ``voices`` mapping. Used when no
            complete custom voice is supplied.
        custom_audio: Optional path to an uploaded reference recording.
        custom_prompt_text: Transcription of ``custom_audio``. The custom voice
            is used only when both ``custom_audio`` and a non-blank
            transcription are provided; otherwise ``voice_name`` is used.

    Returns:
        A 2-tuple ``(path, message)``. On success ``path`` is a freshly created
        temporary ``.wav`` file holding the decoded audio; on any failure
        ``path`` is ``None``. ``message`` is a human-readable status string.
    """
    if not text or not text.strip():
        return None, "Please enter some text"

    # Pick the voice: an uploaded sample plus its transcription wins over the
    # predefined selection.
    if custom_audio is not None and custom_prompt_text and custom_prompt_text.strip():
        voice = {
            "name": "Custom Voice",
            "promptText": custom_prompt_text,
            "promptAudio": custom_audio,
        }
    else:
        try:
            voice = voices[voice_name]
        except KeyError:
            # Report an unknown/cleared selection the same way as every other
            # failure instead of letting the KeyError escape to the caller.
            return None, f"Unknown voice: {voice_name}"

    async def process_tts():
        try:
            task_id = await tts(text, voice)

            # Poll once per second until the task leaves the PENDING state.
            # NOTE(review): there is no upper bound on the wait; a task that
            # never leaves PENDING blocks this call indefinitely.
            result = await get_task_result(task_id)
            while result['status'] == 'PENDING':
                await asyncio.sleep(1)
                result = await get_task_result(task_id)

            if result['status'] != 'SUCCESS':
                return None, f"TTS generation failed: {result['message']}"

            # Presumably a data URL ("<header>,<base64>"): keep only the part
            # after the first comma when one is present.
            _, comma, payload = result['audio_url'].partition(',')
            encoded_audio = payload if comma else result['audio_url']

            # Decode before creating the file so a malformed payload does not
            # leave an empty or truncated file behind.
            audio_bytes = base64.b64decode(encoded_audio)

            # A unique file per call, so successive or overlapping requests
            # never overwrite each other's output.
            with tempfile.NamedTemporaryFile(
                prefix="tts_", suffix=".wav", delete=False
            ) as out:
                out.write(audio_bytes)
                output_file = out.name

            return output_file, f"Successfully generated audio using {voice['name']}"
        except Exception as e:
            # UI boundary: surface any failure as a status message.
            return None, f"Error: {str(e)}"

    return asyncio.run(process_tts())

# Map every key in `voices` to that voice's "name" field; the keys populate
# the dropdown and the names are shown as the selected-voice label.
voice_options = {voice_key: voices[voice_key]["name"] for voice_key in voices}

# Build the Gradio interface: text + predefined-voice column, custom-voice
# column with the generate button, and an output column.
_voice_keys = list(voice_options)
_default_voice = _voice_keys[0]


def _selected_voice_label(voice_key):
    """Return the Markdown line naming the voice stored under ``voice_key``."""
    return f"Selected Voice: {voice_options[voice_key]}"


with gr.Blocks(title="Cantonese Text-to-Speech") as demo:
    gr.Markdown("# Cantonese Text-to-Speech Demo")
    gr.Markdown("Enter text in Cantonese and select a voice to generate speech.")

    with gr.Row():
        # Left column: the text to speak and the predefined voice picker.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to convert",
                placeholder="輸入廣東話文字...",
                lines=5,
            )

            with gr.Group():
                gr.Markdown("### Choose a voice option")
                voice_dropdown = gr.Dropdown(
                    label="Select Predefined Voice",
                    info="Choose a voice for synthesis",
                    choices=_voice_keys,
                    value=_default_voice,
                )

                # Shows the name of the voice behind the selected key.
                voice_name_display = gr.Markdown(
                    value=_selected_voice_label(_default_voice)
                )

        # Middle column: optional custom voice sample and the trigger button.
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### Or upload your own voice (optional)")
                custom_audio = gr.Audio(
                    label="Upload Voice Sample (WAV format)",
                    format="wav",
                    type="filepath",
                )
                custom_prompt_text = gr.Textbox(
                    label="Transcription of Uploaded Audio (required if using custom voice)",
                    placeholder="Enter the exact transcription of the uploaded audio...",
                    lines=2,
                )
                gr.Markdown("*Note: The custom voice sample should be clear with minimal background noise.*")

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the generated audio and a status line.
        with gr.Column(scale=3):
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Markdown("Ready to generate speech")

    # Keep the voice-name label in sync with the dropdown selection.
    voice_dropdown.change(
        fn=_selected_voice_label,
        inputs=voice_dropdown,
        outputs=voice_name_display,
    )

    # Run synthesis on click; concurrency_limit=1 is passed for this event.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, custom_audio, custom_prompt_text],
        outputs=[audio_output, status_text],
        concurrency_limit=1,
    )

# Start the Gradio app only when this file is executed as a script, so that
# importing the module does not launch it.
if __name__ == "__main__":
    demo.launch()