File size: 3,991 Bytes
9e41260
889e1fa
 
 
d011be0
2b5d6f0
889e1fa
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7ad2a01
889e1fa
 
 
7ad2a01
889e1fa
 
7ad2a01
 
 
2b5d6f0
 
 
 
7ad2a01
2b5d6f0
 
7ad2a01
2b5d6f0
 
7ad2a01
 
 
2b5d6f0
7ad2a01
889e1fa
7ad2a01
2b5d6f0
 
 
 
 
7ad2a01
2b5d6f0
f1779f5
7ad2a01
2b5d6f0
12ef859
 
 
7ad2a01
 
 
2b5d6f0
7ad2a01
2b5d6f0
7a5c01c
2b5d6f0
 
7bf74d5
12ef859
889e1fa
 
 
 
2b5d6f0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
889e1fa
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import gradio as gr
import edge_tts
import asyncio
import tempfile
from moviepy.editor import AudioFileClip
import re

async def get_voices():
    """Return the available Edge TTS voices keyed by a display label.

    Each key has the form ``"<ShortName> - <Locale> (<Gender>)"`` and maps to
    the voice's ``ShortName`` as reported by ``edge_tts.list_voices()``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = f"{entry['ShortName']} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = entry['ShortName']
    return labelled

async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* with Edge TTS and save it to a temporary MP3 file.

    Args:
        text: The text to speak. Empty, whitespace-only or ``None`` is rejected.
        voice: Voice label; the part before the first ``" - "`` is passed to
            edge_tts as the voice name.
        rate: Speaking-rate adjustment in percent (sent as e.g. ``"+10%"``).
        pitch: Pitch adjustment in Hz (sent as e.g. ``"-5Hz"``).

    Returns:
        ``(path, None)`` with the path of the generated MP3 on success, or
        ``(None, message)`` when the input was rejected.

    Raises:
        Whatever ``edge_tts.Communicate.save`` raises; the temporary file is
        removed before the exception propagates.
    """
    import os  # local import: only needed for temp-file cleanup on failure

    if not text or not text.strip():
        message = "Please enter the text to convert."
        # NOTE(review): gr.Warning is a notification helper; its return value
        # is not a usable message (it appears to return None - confirm), so
        # the text itself is returned for the caller to inspect.
        gr.Warning(message)
        return None, message
    if not voice:
        message = "Please select a voice."
        gr.Warning(message)
        return None, message

    voice_short_name = voice.split(" - ")[0]
    # Slider values may arrive as floats; the ``+d`` format only accepts ints.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)

    # Only reserve a unique path here; close our handle before edge_tts
    # opens the same path for writing.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name

    try:
        await communicate.save(tmp_path)
    except Exception:
        # Do not leave an empty or partial file behind when synthesis fails.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

    return tmp_path, None

def _format_srt_timestamp(seconds):
    """Render a number of seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so float remainders cannot produce
    # off-by-one millisecond fields.
    remaining_ms = max(0, round(seconds * 1000))
    hours, remaining_ms = divmod(remaining_ms, 3_600_000)
    minutes, remaining_ms = divmod(remaining_ms, 60_000)
    secs, millis = divmod(remaining_ms, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def generate_srt(text, audio_duration, max_words_per_line):
    """Build SRT subtitle text by spreading *text* evenly over the audio.

    The text is split on whitespace and grouped into cues of at most
    ``max_words_per_line`` words. Every word is assumed to take the same
    share of ``audio_duration``, so a cue starts where the previous one ended
    and lasts in proportion to its word count.

    Args:
        text: The spoken text.
        audio_duration: Length of the audio in seconds.
        max_words_per_line: Maximum number of words per cue. Values below 1
            result in one word per cue.

    Returns:
        The SRT document as a string; an empty string when *text* contains
        no words. Each cue is terminated by a blank line, as the SRT format
        requires between cues.
    """
    # str.split() with no argument already collapses runs of whitespace.
    words = text.split()
    total_words = len(words)

    cues = []
    pending = []
    words_done = 0  # words already emitted in earlier cues
    for position, word in enumerate(words, start=1):
        pending.append(word)
        # Flush when the cue is full or the text is exhausted.
        if len(pending) >= max_words_per_line or position == total_words:
            start_time = audio_duration * words_done / total_words
            words_done += len(pending)
            end_time = audio_duration * words_done / total_words
            cues.append(
                f"{len(cues) + 1}\n"
                f"{_format_srt_timestamp(start_time)} --> {_format_srt_timestamp(end_time)}\n"
                f"{' '.join(pending)}\n\n"
            )
            pending = []

    return ''.join(cues)

def tts_interface(text, voice, rate, pitch, max_words_per_line):
    """Gradio callback: synthesize speech and write a matching SRT file.

    Args:
        text: Text to convert.
        voice: Voice label chosen in the dropdown.
        rate: Speaking-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.
        max_words_per_line: Maximum number of words per subtitle cue.

    Returns:
        ``(audio_path, srt_path, None)`` on success, or
        ``(None, None, warning)`` when no audio was produced, where
        ``warning`` is whatever ``text_to_speech`` returned in its second slot.
    """
    import os  # local import: used only to derive the subtitle path

    audio_path, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    # Key off the missing audio path rather than the warning value: the
    # warning slot may be None even when the input was rejected.
    if audio_path is None:
        return None, None, warning

    # Read the duration (seconds), then close the clip so it does not keep
    # the audio file open.
    clip = AudioFileClip(audio_path)
    try:
        audio_duration = clip.duration
    finally:
        clip.close()

    srt_content = generate_srt(text, audio_duration, max_words_per_line)
    # Swap only the extension; str.replace would touch every ".mp3" in the
    # path and would return the audio path itself if the suffix differed.
    srt_path = os.path.splitext(audio_path)[0] + '_subtitle.srt'

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(srt_path, 'w', encoding='utf-8') as f:
        f.write(srt_content)

    return audio_path, srt_path, None

async def create_demo():
    """Build the Gradio interface for the text-to-speech + subtitle tool.

    Fetches the voice list via ``get_voices()`` to populate the voice
    dropdown (with an empty first choice selected by default), then wires
    ``tts_interface`` to five inputs (text, voice, rate, pitch, words per
    line) and three outputs (audio, SRT file, hidden warning Markdown).

    Returns:
        The configured ``gr.Interface``; the caller is responsible for
        launching it.
    """
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=3, maximum=8, value=5, label="Max Words per Line", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated Subtitle (.srt)"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS Text to Speech with SRT",
        description="Convert text to speech and generate synchronized subtitles based on speech rate.",
        analytics_enabled=False,
        # Gradio documents string modes ("never" / "auto" / "manual") for
        # this option; "never" is the documented way to disable flagging.
        allow_flagging="never",
    )

    return demo

# Script entry point. create_demo() is a coroutine, so it is driven to
# completion with asyncio.run() before the resulting interface is launched.
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch()