File size: 5,144 Bytes
4da00bb
 
 
 
 
091d27c
4da00bb
 
 
 
091d27c
4da00bb
 
 
 
091d27c
4da00bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
091d27c
c54507d
091d27c
c4d5b0c
c54507d
091d27c
c54507d
 
 
 
 
091d27c
c54507d
 
 
 
 
091d27c
 
c54507d
 
 
091d27c
 
 
c54507d
 
 
 
 
091d27c
75a0f9c
 
 
c4d5b0c
75a0f9c
091d27c
 
 
 
 
 
75a0f9c
 
 
c4d5b0c
091d27c
c54507d
 
 
 
536587c
091d27c
4da00bb
 
 
536587c
091d27c
4da00bb
 
 
 
 
 
 
 
 
 
 
 
 
 
091d27c
4da00bb
 
 
 
 
 
 
 
 
 
 
 
 
 
536587c
091d27c
4da00bb
 
091d27c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import concatenate_videoclips, AudioFileClip, ImageClip
from wand.image import Image
from wand.drawing import Drawing
from wand.color import Color

# Function to get available voices
async def get_voices():
    """Fetch the Edge TTS voice catalogue.

    Returns a dict mapping a human-readable label
    ("ShortName - Locale (Gender)") to the voice's ShortName,
    which is what edge_tts.Communicate expects.
    """
    catalogue = await edge_tts.list_voices()
    labelled = {}
    for entry in catalogue:
        label = "{0} - {1} ({2})".format(
            entry["ShortName"], entry["Locale"], entry["Gender"]
        )
        labelled[label] = entry["ShortName"]
    return labelled

# Text-to-Speech function
async def text_to_speech(text, voice, rate, pitch):
    """Synthesize *text* to a temporary MP3 file via Microsoft Edge TTS.

    Parameters:
        text: text to speak; must be non-empty after stripping.
        voice: dropdown label of the form "ShortName - Locale (Gender)".
        rate: rate adjustment in percent (e.g. 10 -> "+10%").
        pitch: pitch adjustment in Hz (e.g. -5 -> "-5Hz").

    Returns:
        (path, warning): path to the MP3 and None on success, or
        (None, gr.Warning(...)) when the input is invalid.
    """
    if not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")

    voice_short_name = voice.split(" - ")[0]
    # Cast to int: gradio sliders may deliver floats, and the "+d" format
    # spec raises ValueError for non-integer values.
    rate_str = f"{int(rate):+d}%"
    pitch_str = f"{int(pitch):+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    # Create the temp file name, then close the handle BEFORE saving:
    # on Windows a still-open NamedTemporaryFile cannot be reopened for
    # writing by edge_tts.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
    await communicate.save(tmp_path)
    return tmp_path, None

# Text-to-Video function
def text_to_video(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Render *text* as a narrated video: one image + TTS audio clip per
    wrapped line, concatenated into a single MP4.

    Parameters:
        text: input text; greedily word-wrapped to fit the frame width.
        voice, rate, pitch: forwarded to text_to_speech() per line.
        video_width, video_height: frame size in pixels.
        bg_color, text_color: colors accepted by wand.color.Color.
        text_font: path to a font file; falls back to "Arial" if missing.
        text_size: font size in points.

    Returns:
        (path, warning): path to the rendered MP4 and None on success,
        or (None, gr.Warning(...)) on invalid input.
    """
    # Ensure the font file exists, else use a default font
    font_path = os.path.abspath(text_font) if os.path.exists(text_font) else "Arial"

    lines = _wrap_text(text, video_width, text_size)
    if not lines:
        # Avoid concatenate_videoclips([]) crashing on empty input.
        return None, gr.Warning("Please enter the text to convert.")

    audio_clips = []
    video_clips = []

    for idx, line in enumerate(lines):
        audio, warning = asyncio.run(text_to_speech(line, voice, rate, pitch))
        if warning:
            return None, warning
        audio_clip = AudioFileClip(audio)
        audio_clips.append(audio_clip)

        # Create an image for each line of text
        with Drawing() as draw:
            draw.font = font_path
            draw.font_size = text_size
            draw.fill_color = Color(text_color)
            with Image(width=video_width, height=video_height, background=Color(bg_color)) as img:
                # Wand's Drawing has no get_text_dimensions(); the measuring
                # API is get_font_metrics(image, text).
                metrics = draw.get_font_metrics(img, line)
                # draw.text requires integer coordinates; text_width is float.
                x_position = int((video_width - metrics.text_width) // 2)  # center horizontally
                y_position = (video_height - text_size) // 2  # center vertically
                draw.text(x_position, y_position, line)
                draw(img)
                # Index-based filename: raw line text may contain characters
                # that are invalid in file names (/, :, ...).
                img_path = os.path.join(tempfile.gettempdir(), f"tts_line_{idx}.png")
                img.save(filename=img_path)
                text_clip = ImageClip(img_path).set_duration(audio_clip.duration).set_audio(audio_clip)
                video_clips.append(text_clip)

    # Combine all video clips
    final_video = concatenate_videoclips(video_clips)
    final_video_path = os.path.join(tempfile.gettempdir(), "output_video.mp4")
    final_video.write_videofile(final_video_path, fps=24, codec="libx264")
    return final_video_path, None


def _wrap_text(text, video_width, text_size):
    """Greedy word-wrap: split *text* into lines no longer than a rough
    character budget derived from the frame width and font size."""
    max_chars = video_width // (text_size // 2)
    lines = []
    current_line = ""
    for word in text.split():
        if len(current_line) + len(word) + 1 > max_chars:
            lines.append(current_line)
            current_line = word
        else:
            current_line = f"{current_line} {word}".strip()
    if current_line:
        lines.append(current_line)
    return lines

# Gradio interface function
def tts_interface(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    """Gradio callback: render the video and map the result onto the
    three interface outputs (audio, video, warning)."""
    video_path, warning = text_to_video(
        text, voice, rate, pitch,
        video_width, video_height,
        bg_color, text_color, text_font, text_size,
    )
    # The audio output slot is always None here: the narration is
    # embedded in the generated video.
    return None, video_path, warning

# Create Gradio app
async def create_demo():
    """Build the gradio Interface (async because the voice list must be
    fetched from the Edge TTS service first)."""
    voices = await get_voices()

    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=640, maximum=1920, value=1080, label="Video Width", step=10),
            gr.Slider(minimum=480, maximum=1080, value=720, label="Video Height", step=10),
            gr.ColorPicker(value="#000000", label="Background Color"),
            gr.ColorPicker(value="#FFFFFF", label="Text Color"),
            gr.Textbox(label="Text Font", value="Arial"),  # Default to Arial for testing
            gr.Slider(minimum=10, maximum=100, value=24, label="Text Size", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Video(label="Generated Video"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS Text to Speech and Video",
        description="Convert text to speech and video using Microsoft Edge TTS. Adjust rate and pitch: 0 is the default value, positive values increase, and negative values decrease.",
        analytics_enabled=False,
        # gradio expects "never"/"auto"/"manual" here; the boolean False
        # form is deprecated and rejected by current gradio releases.
        allow_flagging="never",
    )

    return demo

# Run the app
if __name__ == "__main__":
    # Building the interface requires an async call (voice list fetch),
    # so drive it with asyncio.run before launching the server.
    app = asyncio.run(create_demo())
    app.launch(share=True)