"""Gradio front end for multilingual text-to-speech with Coqui XTTS v2."""

import os
import tempfile
import zipfile

import gradio as gr
import torch
from pydub import AudioSegment
from TTS.api import TTS

# Constants
AUDIO_FORMATS = [".wav", ".mp3", ".flac", ".mp4"]
LANGUAGES = [
    "en", "es", "fr", "de", "it", "pt", "pl", "tr", "ru", "nl", "cs", "ar",
    "zh-cn", "ja", "hu", "ko", "hi",
]

# Optional reference clip for voice cloning.  When unset, the first voice the
# loaded model reports (if any) is used instead.
SPEAKER_WAV = os.environ.get("XTTS_SPEAKER_WAV")

# Device setup
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

# TTS model setup
# The licence flag must be in the environment before the model is constructed.
os.environ["COQUI_TOS_AGREED"] = "1"
MODEL_PATH = "tts_models/multilingual/multi-dataset/xtts_v2"
tts = TTS(MODEL_PATH).to(device)


def _speaker_kwargs():
    """Return the speaker-selection keyword arguments for ``tts_to_file``."""
    if SPEAKER_WAV:
        return {"speaker_wav": SPEAKER_WAV}
    # NOTE(review): assumes the installed TTS release exposes built-in XTTS
    # voices through ``tts.speakers`` -- confirm against the pinned version.
    voices = getattr(tts, "speakers", None)
    if voices:
        return {"speaker": voices[0]}
    return {}


def _shift_pitch(segment, semitones):
    """Shift *segment* by *semitones* by reinterpreting its frame rate.

    This is plain resampling, so the duration changes together with the
    pitch (higher pitch means shorter audio).
    """
    if not semitones:
        return segment
    factor = 2.0 ** (semitones / 12.0)
    retimed = segment._spawn(
        segment.raw_data,
        overrides={"frame_rate": int(segment.frame_rate * factor)},
    )
    # Convert back to the original rate so players handle the file normally.
    return retimed.set_frame_rate(segment.frame_rate)


def generate_audio(text, language, speed, pitch, volume):
    """Synthesize *text* and return the path of the resulting MP3 file.

    Args:
        text: Text to speak; must contain at least one non-blank character.
        language: Language code, one of ``LANGUAGES``.
        speed: Speaking-rate multiplier (1.0 is normal). ``None`` leaves the
            model default in place.
        pitch: Pitch shift in semitones applied after synthesis (0 = none).
        volume: Gain in dB applied after synthesis (0 = none).

    Returns:
        Path to an MP3 file inside a freshly created temporary directory.

    Raises:
        gr.Error: If *text* is empty or whitespace only.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")

    # A private directory per request keeps concurrent users from
    # overwriting each other's output.
    work_dir = tempfile.mkdtemp(prefix="tts_")
    wav_path = os.path.join(work_dir, "output.wav")
    mp3_path = os.path.join(work_dir, "output.mp3")

    synth_kwargs = _speaker_kwargs()
    if speed is not None:
        # NOTE(review): whether ``speed`` reaches the local XTTS model depends
        # on the installed TTS release -- confirm against the pinned version.
        synth_kwargs["speed"] = speed

    tts.tts_to_file(
        text=text,
        language=language,
        file_path=wav_path,
        **synth_kwargs,
    )

    # Pitch and volume are applied as post-processing with pydub.
    segment = _shift_pitch(AudioSegment.from_wav(wav_path), pitch)
    if volume:
        segment = segment.apply_gain(volume)

    # MP3 export needs ffmpeg (or libav) on PATH.
    segment.export(mp3_path, format="mp3")
    os.remove(wav_path)
    return mp3_path


iface = gr.Interface(
    fn=generate_audio,
    inputs=[
        gr.Textbox(label="Text", lines=4),
        gr.Dropdown(choices=LANGUAGES, value="en", label="Language"),
        gr.Slider(minimum=0.5, maximum=2.0, value=1.0, step=0.05, label="Speed"),
        gr.Slider(minimum=-12, maximum=12, value=0, step=1, label="Pitch (semitones)"),
        gr.Slider(minimum=-20, maximum=20, value=0, step=1, label="Volume (dB)"),
    ],
    outputs=gr.Audio(type="filepath", label="Speech"),
    title="Text-to-Speech",
    description="Convert text to speech in multiple languages.",
    # NOTE(review): newer Gradio releases rename this option to
    # ``flagging_mode`` -- confirm against the pinned Gradio version.
    allow_flagging="never",
    cache_examples=False,
)

iface.launch()