import gradio as gr
import edge_tts
import asyncio
import tempfile
import os
from moviepy.editor import concatenate_videoclips, AudioFileClip, ImageClip
from wand.image import Image
from wand.drawing import Drawing
from wand.color import Color
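# Overview: the app converts input text to speech with edge-tts, renders each
# wrapped line of text onto an image with Wand (which requires a local
# ImageMagick install), and joins the narrated stills into one video with MoviePy.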
# Function to get available voices
async def get_voices():
    voices = await edge_tts.list_voices()
    return {f"{v['ShortName']} - {v['Locale']} ({v['Gender']})": v['ShortName'] for v in voices}
# Text-to-Speech function
async def text_to_speech(text, voice, rate, pitch):
    if not text.strip():
        return None, gr.Warning("Please enter the text to convert.")
    if not voice:
        return None, gr.Warning("Please select a voice.")
    voice_short_name = voice.split(" - ")[0]
    # edge-tts expects rate as a signed percentage (e.g. "+10%") and pitch as a signed Hz offset (e.g. "-5Hz")
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    communicate = edge_tts.Communicate(text, voice_short_name, rate=rate_str, pitch=pitch_str)
    with tempfile.NamedTemporaryFile(delete=False, suffix=".mp3") as tmp_file:
        tmp_path = tmp_file.name
        await communicate.save(tmp_path)
    return tmp_path, None
# Text-to-Video function
def text_to_video(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    # Ensure the font file exists, else fall back to a default font
    font_path = os.path.abspath(text_font) if os.path.exists(text_font) else "Arial"
    # Wrap the text into lines that roughly fit the video width
    words = text.split()
    lines = []
    current_line = ""
    for word in words:
        # Approximate characters per line from the video width and font size
        if len(current_line) + len(word) + 1 > (video_width // (text_size // 2)):
            lines.append(current_line)
            current_line = word
        else:
            current_line = f"{current_line} {word}".strip()
    if current_line:
        lines.append(current_line)
    if not lines:
        return None, gr.Warning("Please enter the text to convert.")
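    # For each wrapped line: synthesize its narration, draw it on a still image,
    # and pair the two as one video segment.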
    audio_clips = []
    video_clips = []
    for i, line in enumerate(lines):
        audio, warning = asyncio.run(text_to_speech(line, voice, rate, pitch))
        if warning:
            return None, warning
        audio_clip = AudioFileClip(audio)
        audio_clips.append(audio_clip)
        # Render an image for this line of text
        with Drawing() as draw:
            draw.font = font_path
            draw.font_size = text_size
            draw.fill_color = Color(text_color)
            with Image(width=video_width, height=video_height, background=Color(bg_color)) as img:
                # Measure the rendered text so it can be centered
                metrics = draw.get_font_metrics(img, line)
                text_width = int(metrics.text_width)
                x_position = max((video_width - text_width) // 2, 0)  # Center horizontally
                y_position = (video_height - text_size) // 2  # Center vertically (approximate)
                draw.text(x_position, y_position, line)
                draw(img)
                # Name the file by line index; the raw text may contain characters
                # that are not valid in file names
                img_path = os.path.join(tempfile.gettempdir(), f"line_{i}.png")
                img.save(filename=img_path)
        # Show the image for exactly as long as its narration lasts
        text_clip = ImageClip(img_path).set_duration(audio_clip.duration).set_audio(audio_clip)
        video_clips.append(text_clip)
    # Combine all per-line clips into the final video
    final_video = concatenate_videoclips(video_clips)
    final_video_path = os.path.join(tempfile.gettempdir(), "output_video.mp4")
    final_video.write_videofile(final_video_path, fps=24, codec="libx264")
    return final_video_path, None
# Gradio interface function
def tts_interface(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size):
    video, warning = text_to_video(text, voice, rate, pitch, video_width, video_height, bg_color, text_color, text_font, text_size)
    # The audio output is left empty; the narration is already embedded in the video
    return None, video, warning
# Create Gradio app
async def create_demo():
    voices = await get_voices()
    demo = gr.Interface(
        fn=tts_interface,
        inputs=[
            gr.Textbox(label="Input Text", lines=5),
            gr.Dropdown(choices=[""] + list(voices.keys()), label="Select Voice", value=""),
            gr.Slider(minimum=-50, maximum=50, value=0, label="Rate Adjustment (%)", step=1),
            gr.Slider(minimum=-20, maximum=20, value=0, label="Pitch Adjustment (Hz)", step=1),
            gr.Slider(minimum=640, maximum=1920, value=1080, label="Video Width", step=10),
            gr.Slider(minimum=480, maximum=1080, value=720, label="Video Height", step=10),
            gr.ColorPicker(value="#000000", label="Background Color"),
            gr.ColorPicker(value="#FFFFFF", label="Text Color"),
            gr.Textbox(label="Text Font", value="Arial"),  # Font name or path to a font file
            gr.Slider(minimum=10, maximum=100, value=24, label="Text Size", step=1)
        ],
        outputs=[
            gr.Audio(label="Generated Audio", type="filepath"),
            gr.Video(label="Generated Video"),
            gr.Markdown(label="Warning", visible=False)
        ],
        title="Edge TTS Text to Speech and Video",
        description="Convert text to speech and video using Microsoft Edge TTS. Adjust rate and pitch: 0 is the default value, positive values increase, and negative values decrease.",
        analytics_enabled=False,
        allow_flagging="never",
    )
    return demo
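# create_demo is async because the voice list must be fetched from edge-tts
# before the interface is built.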
# Run the app
if __name__ == "__main__":
    demo = asyncio.run(create_demo())
    demo.launch(share=True)