Spaces:
Running
Running
File size: 3,991 Bytes
9e41260 889e1fa d011be0 2b5d6f0 889e1fa 7ad2a01 889e1fa 7ad2a01 889e1fa 7ad2a01 2b5d6f0 7ad2a01 2b5d6f0 7ad2a01 2b5d6f0 7ad2a01 2b5d6f0 7ad2a01 889e1fa 7ad2a01 2b5d6f0 7ad2a01 2b5d6f0 f1779f5 7ad2a01 2b5d6f0 12ef859 7ad2a01 2b5d6f0 7ad2a01 2b5d6f0 7a5c01c 2b5d6f0 7bf74d5 12ef859 889e1fa 2b5d6f0 889e1fa |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
import gradio as gr
import edge_tts
import asyncio
import tempfile
from moviepy.editor import AudioFileClip
import re
async def get_voices():
    """Return the available Edge TTS voices keyed by a display label.

    Each entry reported by ``edge_tts.list_voices()`` is turned into a
    ``"<ShortName> - <Locale> (<Gender>)"`` label that maps back to the
    voice's ``ShortName``.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        short_name = entry['ShortName']
        label = f"{short_name} - {entry['Locale']} ({entry['Gender']})"
        labelled[label] = short_name
    return labelled
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to a temporary MP3 file with Edge TTS.

    Args:
        text: The text to speak. Empty, whitespace-only or ``None`` input
            is rejected.
        voice: A dropdown label of the form ``"<ShortName> - ..."``; only
            the part before the first ``" - "`` is sent to Edge TTS.
        rate: Speaking-rate adjustment in percent (sign is added here).
        pitch: Pitch adjustment in Hz (sign is added here).

    Returns:
        A ``(audio_path, warning)`` tuple. On success ``audio_path`` is the
        path of the generated ``.mp3`` and ``warning`` is ``None``. On a
        validation failure ``audio_path`` is ``None`` and ``warning`` is
        whatever ``gr.Warning(...)`` evaluates to.
    """
    if not text or not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]

    # The ``+d`` format code only accepts integers; coerce first so that a
    # float coming from the UI/API (e.g. 5.0) does not raise ValueError.
    rate_str = f"{int(round(rate)):+d}%"
    pitch_str = f"{int(round(pitch)):+d}Hz"

    communicate = edge_tts.Communicate(
        text, voice_short_name, rate=rate_str, pitch=pitch_str
    )

    # Only used to reserve a unique path; delete=False keeps the file around
    # after the handle is closed so edge-tts can write to it by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, None
def _format_srt_timestamp(seconds):
    """Format a duration in seconds as an SRT ``HH:MM:SS,mmm`` timestamp."""
    # Work in whole milliseconds so float noise (e.g. 0.3999999) cannot
    # truncate to the wrong millisecond value.
    total_ms = int(round(max(seconds, 0) * 1000))
    hours, remainder = divmod(total_ms, 3600000)
    minutes, remainder = divmod(remainder, 60000)
    secs, millis = divmod(remainder, 1000)
    return f"{hours:02}:{minutes:02}:{secs:02},{millis:03}"


def generate_srt(text, audio_duration, max_words_per_line):
    """Build SRT subtitle text by spreading *text* evenly over the audio.

    The text is split on whitespace and grouped into cues of at most
    ``max_words_per_line`` words. Every word is assumed to take the same
    share of ``audio_duration``, so a cue starts where the previous one
    ended and lasts in proportion to its word count.

    Args:
        text: The spoken text.
        audio_duration: Total audio length in seconds.
        max_words_per_line: Maximum number of words per subtitle cue.

    Returns:
        The SRT document as a string (cues separated by a blank line), or
        an empty string when *text* contains no words.
    """
    # str.split() with no argument already collapses runs of whitespace.
    words = text.split()
    total_words = len(words)
    if total_words == 0:
        return ''

    cues = []
    current_line = []
    words_done = 0  # words already assigned to earlier cues
    for i, word in enumerate(words):
        current_line.append(word)
        # Emit a cue when the line is full or the text is exhausted.
        if len(current_line) >= max_words_per_line or i == total_words - 1:
            start_time = audio_duration * words_done / total_words
            words_done += len(current_line)
            end_time = audio_duration * words_done / total_words
            cues.append(
                f"{len(cues) + 1}\n"
                f"{_format_srt_timestamp(start_time)} --> "
                f"{_format_srt_timestamp(end_time)}\n"
                f"{' '.join(current_line)}\n"
            )
            current_line = []

    # Each cue ends in a newline; joining with another one yields the blank
    # separator line the SRT format requires between cues.
    return '\n'.join(cues)
def tts_interface(text, voice, rate, pitch, max_words_per_line):
    """Gradio handler: synthesize speech and write a matching SRT file.

    Args:
        text: The text to speak and subtitle.
        voice: The selected voice label from the dropdown.
        rate: Speaking-rate adjustment in percent.
        pitch: Pitch adjustment in Hz.
        max_words_per_line: Maximum number of words per subtitle cue.

    Returns:
        ``(audio_path, srt_path, warning)``. When synthesis was refused the
        first two items are ``None`` and the third is whatever
        ``text_to_speech`` returned as its warning.
    """
    audio_path, warning = asyncio.run(text_to_speech(text, voice, rate, pitch))
    # NOTE(review): gr.Warning(...) may evaluate to a falsy value (confirm
    # for the installed Gradio version), so a missing audio path is also
    # treated as "validation failed" instead of falling through to
    # AudioFileClip(None).
    if warning or audio_path is None:
        return None, None, warning

    # Read the duration (in seconds), then release the clip's reader.
    clip = AudioFileClip(audio_path)
    try:
        audio_duration = clip.duration
    finally:
        clip.close()

    srt_content = generate_srt(text, audio_duration, max_words_per_line)

    # Swap only the trailing ".mp3"; a blanket str.replace would also touch
    # a ".mp3" occurring elsewhere in the path.
    suffix = '.mp3'
    stem = audio_path[:-len(suffix)] if audio_path.endswith(suffix) else audio_path
    srt_path = stem + '_subtitle.srt'

    # Explicit UTF-8 so non-ASCII text does not depend on the platform's
    # default encoding.
    with open(srt_path, 'w', encoding='utf-8') as f:
        f.write(srt_content)
    return audio_path, srt_path, None
async def create_demo():
    """Build the Gradio interface for the TTS + subtitle tool.

    The voice list is fetched first (via ``get_voices``) so the dropdown can
    be populated; an empty first choice is the default, meaning no voice is
    pre-selected.

    Returns:
        The configured ``gr.Interface`` (not yet launched).
    """
    voices = await get_voices()
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=3, maximum=8, value=5, label="Max Words per Line", step=1),
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.File(label="Generated Subtitle (.srt)"),
            gr.Markdown(label="Warning", visible=False),
        ],
        title="Edge TTS Text to Speech with SRT",
        description="Convert text to speech and generate synchronized subtitles based on speech rate.",
        analytics_enabled=False,
        # Gradio takes a string mode here ("never" / "auto" / "manual");
        # the boolean form is deprecated.
        allow_flagging="never",
    )
    return demo
if __name__ == "__main__":
    # Building the UI needs the voice list, which is fetched asynchronously,
    # so the coroutine is driven to completion before the app is served.
    demo = asyncio.run(create_demo())
    demo.launch()
|