# Source: Hugging Face Space hon9kon9ize/Cantonese-TTS-playground (status: Running)
# File size: 4,473 bytes; revision c005bf8


import gradio as gr
import asyncio
import base64
import os
from tts import voices, tts, get_task_result, Voice
import tempfile

def generate_speech(text, voice_name, custom_audio=None, custom_prompt_text=None):
    """Generate speech from text using a predefined or an uploaded custom voice.

    Args:
        text: Text to synthesize. ``None``, empty, or whitespace-only input
            is rejected with a status message.
        voice_name: Key into the module-level ``voices`` mapping. Only used
            when no complete custom voice is supplied.
        custom_audio: Optional path of an uploaded voice sample.
        custom_prompt_text: Transcription of ``custom_audio``. The custom
            voice is used only when both the audio and a non-blank
            transcription are given; otherwise the predefined voice is used.

    Returns:
        A ``(audio_path, status_message)`` tuple. ``audio_path`` is the path
        of the file the decoded audio was written to, or ``None`` when
        nothing was generated.
    """
    # A cleared textbox may deliver None rather than "", so guard both.
    if not text or not text.strip():
        return None, "Please enter some text"

    # Handle custom voice upload
    if custom_audio is not None and custom_prompt_text and custom_prompt_text.strip():
        # Build an ad-hoc voice description around the uploaded audio
        voice = {
            "name": "Custom Voice",
            "promptText": custom_prompt_text,
            "promptAudio": custom_audio,
        }
    else:
        # Use predefined voice; report an unknown/missing selection as a
        # status message instead of letting the lookup error escape.
        try:
            voice = voices[voice_name]
        except (KeyError, TypeError):
            return None, f"Unknown voice: {voice_name}"

    async def process_tts():
        """Submit the TTS task, poll until it finishes, and save the audio."""
        try:
            task_id = await tts(text, voice)

            # Poll once per second until the task leaves the PENDING state.
            # NOTE(review): there is no upper bound on this wait; a task that
            # never leaves PENDING blocks this request indefinitely.
            while True:
                result = await get_task_result(task_id)
                if result['status'] != 'PENDING':
                    break
                await asyncio.sleep(1)

            if result['status'] != 'SUCCESS':
                return None, f"TTS generation failed: {result['message']}"

            # The payload may be a data URL ("<header>,<base64>"); keep only
            # the base64 part when a header is present.
            audio_data = result['audio_url']
            _header, separator, payload = audio_data.partition(',')
            if separator:
                audio_data = payload

            # Decode before creating the file so a malformed payload does not
            # leave an empty or truncated file behind.
            audio_bytes = base64.b64decode(audio_data)

            # Write to a unique temporary file: a fixed name in the working
            # directory would be overwritten by every other request.
            # NOTE(review): the file is not removed here because its path is
            # returned to the caller; confirm the cleanup policy.
            with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
                f.write(audio_bytes)
                output_file = f.name

            return output_file, f"Successfully generated audio using {voice['name']}"
        except Exception as e:
            # Top-level boundary for the UI: surface any failure as a status
            # message rather than an unhandled error.
            return None, f"Error: {str(e)}"

    return asyncio.run(process_tts())

# Map every voice key to its display name; the keys feed the dropdown and the
# names feed the "Selected Voice" label.
voice_options = {
    voice_key: voice_entry["name"]
    for voice_key, voice_entry in voices.items()
}

# Create the Gradio interface

# Resolve the default dropdown selection once. An empty voice registry yields
# no default instead of an IndexError at import time, so the custom-voice
# upload path stays usable.
_voice_keys = list(voice_options)
_default_voice_key = _voice_keys[0] if _voice_keys else None


def _selected_voice_label(voice_key):
    """Return the Markdown text naming the voice selected in the dropdown."""
    if voice_key in voice_options:
        return f"Selected Voice: {voice_options[voice_key]}"
    # No (or an unknown) selection: show a placeholder instead of raising.
    return "Selected Voice: (none)"


with gr.Blocks(title="Cantonese Text-to-Speech") as demo:
    gr.Markdown("# Cantonese Text-to-Speech Demo")
    gr.Markdown("Enter text in Cantonese and select a voice to generate speech.")

    with gr.Row():
        # Left column: text input and predefined voice selection
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                placeholder="輸入廣東話文字...",
                label="Text to convert",
                lines=5
            )

            with gr.Group():
                gr.Markdown("### Choose a voice option")
                voice_dropdown = gr.Dropdown(
                    choices=_voice_keys,
                    value=_default_voice_key,
                    label="Select Predefined Voice",
                    info="Choose a voice for synthesis"
                )

                # Display the actual voice name based on the selection
                voice_name_display = gr.Markdown(
                    value=_selected_voice_label(_default_voice_key)
                )

        # Middle column: optional custom voice upload and the trigger button
        with gr.Column(scale=2):
            with gr.Group():
                gr.Markdown("### Or upload your own voice (optional)")
                custom_audio = gr.Audio(
                    label="Upload Voice Sample (WAV format)",
                    type="filepath",
                    format="wav"
                )
                custom_prompt_text = gr.Textbox(
                    placeholder="Enter the exact transcription of the uploaded audio...",
                    label="Transcription of Uploaded Audio (required if using custom voice)",
                    lines=2
                )
                gr.Markdown("*Note: The custom voice sample should be clear with minimal background noise.*")

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: generated audio and status message
        with gr.Column(scale=3):
            audio_output = gr.Audio(label="Generated Speech", type="filepath")
            status_text = gr.Markdown("Ready to generate speech")

    # Update the voice name display when dropdown changes
    voice_dropdown.change(
        fn=_selected_voice_label,
        inputs=voice_dropdown,
        outputs=voice_name_display
    )

    # Generate speech when button is clicked; requests are processed one at a
    # time (concurrency_limit=1).
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, voice_dropdown, custom_audio, custom_prompt_text],
        outputs=[audio_output, status_text],
        concurrency_limit=1
    )

# Launch the app only when this file is executed as a script
if __name__ == "__main__":
    demo.launch()