TextToSpeech

Sleeping

File size: 4,530 Bytes

9e41260
889e1fa
 
 
 
d011be0
889e1fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26ce970
d011be0
889e1fa
26ce970
3ae4583
12ef859
26ce970
 
 
 
 
3ae4583
12ef859
889e1fa
 
 
26ce970
3ae4583
12ef859
3ae4583
889e1fa
 
 
 
 
 
 
 
 
 
 
 
 
12ef859
 
 
889e1fa
12ef859
 
 
 
d011be0
 
12ef859
d011be0
7bf74d5
12ef859
889e1fa
 
 
 
 
 
 
 
 
 
d011be0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fef1314
 
d011be0
 
fef1314
 
 
d011be0
 
 
 
 
 
fef1314
889e1fa

import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import AudioFileClip

# Fetch all available voices
async def get_voices():
    """Return a mapping from display label to edge-tts voice short name.

    Each label is built as ``"<ShortName> - <Locale> (<Gender>)"`` from the
    entries returned by ``edge_tts.list_voices()``; the value is the entry's
    ``ShortName``.
    """
    label_to_short_name = {}
    for entry in await edge_tts.list_voices():
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        label_to_short_name[label] = entry['ShortName']
    return label_to_short_name

# Text-to-speech conversion
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize ``text`` into a temporary MP3 file using edge-tts.

    Args:
        text: The text to speak. Blank or missing text is rejected.
        voice: Voice label of the form ``"<ShortName> - ..."``; only the part
            before the first ``" - "`` is passed to edge-tts.
        rate: Speaking-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).

    Returns:
        ``(mp3_path, None)`` on success. When the text is blank or no voice
        is selected, ``(None, <result of gr.Warning(...)>)``.

    Raises:
        Whatever ``edge_tts.Communicate.save`` raises; the temporary file is
        removed before the exception propagates.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # rate/pitch may arrive as floats (e.g. 5.0); the "+d" format spec only
    # accepts integers, so normalise first.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # The temp file is only used to reserve a unique path. Close our handle
    # before edge-tts writes to it, since writing to a path that is still
    # open elsewhere fails on some platforms.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except BaseException:
        # delete=False means nobody else will clean this up on failure.
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path, None

# Generate an SRT file; each subtitle entry holds two words
def generate_srt(words, audio_duration, srt_path, fps=24):
    """Write an evenly timed SRT subtitle file with two words per cue.

    The audio duration is split into equal slices, one per cue. An odd
    trailing word gets a cue of its own, and an empty word list produces an
    empty file.

    Args:
        words: Sequence of words, in speaking order.
        audio_duration: Total audio length in seconds.
        srt_path: Destination path; the file is overwritten (UTF-8).
        fps: Unused; kept so existing callers that pass it keep working.

    Returns:
        ``srt_path``, unchanged.
    """
    # Ceiling division: the number of cues actually written. Dividing by
    # len(words) // 2 instead would be zero for a single word and too small
    # for odd counts, stretching the subtitles past the end of the audio.
    cue_count = (len(words) + 1) // 2
    segment_duration = audio_duration / cue_count if cue_count else 0

    with open(srt_path, 'w', encoding='utf-8') as srt_file:
        for cue_index in range(cue_count):
            pair = words[2 * cue_index:2 * cue_index + 2]
            # Derive both bounds from the index rather than accumulating, so
            # float error does not build up and each cue ends exactly where
            # the next one starts.
            start_time_str = format_srt_time(cue_index * segment_duration)
            end_time_str = format_srt_time((cue_index + 1) * segment_duration)
            # join() avoids a trailing space when the last cue has one word.
            srt_file.write(
                f"{cue_index + 1}\n{start_time_str} --> {end_time_str}\n{' '.join(pair)}\n\n"
            )

    return srt_path


def format_srt_time(seconds):
    """Format a time offset in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    The value is rounded to the nearest millisecond; negative inputs are
    clamped to zero.
    """
    # Round once on the total millisecond count. Truncating the fractional
    # part separately turns e.g. 1.001 into ",000" because of float
    # representation error.
    total_millis = max(0, round(seconds * 1000))
    total_seconds, millis = divmod(total_millis, 1000)
    total_minutes, secs = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"

# Text to audio plus SRT subtitles
async def text_to_audio_and_srt(text, voice, rate, pitch):
    """Synthesize speech for ``text`` and write a matching SRT file.

    The SRT file is placed next to the generated audio, named
    ``<audio basename>_subtitle.srt``, and is timed by spreading the
    whitespace-separated words of ``text`` evenly over the audio duration.

    Args:
        text: Text to convert.
        voice: Voice label as accepted by ``text_to_speech``.
        rate: Speaking-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when ``text_to_speech`` rejected the input.
    """
    audio_path, warning = await text_to_speech(text, voice, rate, pitch)
    # A rejected request is signalled by a missing audio path. The second
    # element is whatever gr.Warning(...) returned, which may be falsy, so
    # checking it alone could let a None path through to AudioFileClip.
    if warning or audio_path is None:
        return None, None, warning

    audio_clip = AudioFileClip(audio_path)
    try:
        audio_duration = audio_clip.duration
    finally:
        # Only the duration is needed; release the reader right away.
        audio_clip.close()

    # Generate SRT file based on the entire text
    base_name = os.path.splitext(audio_path)[0]
    srt_path = f"{base_name}_subtitle.srt"
    words = text.split()
    generate_srt(words, audio_duration, srt_path)

    return audio_path, srt_path, None

# Gradio interface function
def tts_interface(text, voice, rate, pitch):
    """Synchronous wrapper that runs the async text-to-audio/SRT pipeline.

    Returns:
        The ``(audio_path, srt_path, warning)`` triple produced by
        ``text_to_audio_and_srt``.
    """
    pipeline = text_to_audio_and_srt(text, voice, rate, pitch)
    audio_file, subtitle_file, notice = asyncio.run(pipeline)
    return audio_file, subtitle_file, notice

# Build the Gradio app
async def create_demo():
    """Assemble the Gradio Blocks UI for speech and subtitle generation.

    The voice list is fetched first (hence the coroutine), then the layout
    is built: inputs in the left column, generated outputs in the right.

    Returns:
        The constructed ``gr.Blocks`` app; it is not launched here.
    """
    voice_map = await get_voices()
    # Leading empty entry so the dropdown starts with no voice selected.
    voice_choices = ["", *voice_map]

    with gr.Blocks() as demo:
        # Page header.
        gr.Markdown(
            """
            <h1 style="text-align: center; color: #333;">Text to Speech with Subtitles</h1>
            <p style="text-align: center; color: #555;">Convert your text to natural-sounding speech and generate subtitles (SRT) for your audio.</p>
            """,
            elem_id="header",
        )

        with gr.Row():
            # Left column: user inputs and the trigger button.
            with gr.Column():
                text_input = gr.Textbox(label="Input Text", lines=5, placeholder="Enter text here...")
                voice_dropdown = gr.Dropdown(choices=voice_choices, label="Select Voice", value="")
                rate_slider = gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1)
                pitch_slider = gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1)

                generate_button = gr.Button("Generate Audio and Subtitles", variant="primary")

            # Right column: generated artefacts plus a hidden warning slot.
            with gr.Column():
                output_audio = gr.Audio(label="Generated Audio", type="filepath")
                output_srt = gr.File(label="Generated SRT", file_count="single")
                warning_msg = gr.Markdown(label="Warning", visible=False)

        # Wire the button to the synchronous pipeline wrapper.
        click_inputs = [text_input, voice_dropdown, rate_slider, pitch_slider]
        click_outputs = [output_audio, output_srt, warning_msg]
        generate_button.click(fn=tts_interface, inputs=click_inputs, outputs=click_outputs)

    return demo

# Run the app
if __name__ == "__main__":
    # create_demo() is a coroutine (it awaits the voice list), so it has to
    # be driven to completion before the app object is available.
    demo = asyncio.run(create_demo())
    # Launch the assembled Gradio app.
    demo.launch()