# Spaces: Running
import streamlit as st | |
from TTS.api import TTS | |
import tempfile | |
import os | |
from pydub import AudioSegment | |
# Initialize the TTS model
@st.cache_resource
def _load_tts_model():
    """Load the XTTS v2 model once and reuse it across Streamlit reruns.

    Streamlit re-executes this script from the top on every widget
    interaction; without caching, the model would be constructed again on
    each rerun.

    Returns:
        The ``TTS`` instance for the multilingual XTTS v2 model.
    """
    return TTS("tts_models/multilingual/multi-dataset/xtts_v2")


tts = _load_tts_model()
# Main page: heading plus a one-line usage hint.
st.title("XTTS v2 Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar: upload widget for the reference voice clip.
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)
# Function to convert audio to WAV format
def convert_to_wav(audio_file):
    """Convert an audio file to a temporary WAV file and return its path.

    Args:
        audio_file: Anything ``AudioSegment.from_file`` accepts; in this
            script it is the object returned by the Streamlit file uploader.

    Returns:
        Path of a newly created ``.wav`` temporary file. The file is NOT
        deleted automatically; the caller must remove it when done.

    Raises:
        Exception: Whatever pydub raises while decoding or exporting. The
            temporary file is removed before the error is re-raised.
    """
    # Only the unique path is needed: close the handle immediately so it is
    # not leaked and so export() can reopen the path on every platform.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        wav_path = temp_audio.name

    try:
        audio = AudioSegment.from_file(audio_file)
        # export() returns the file handle it opened for the output; close it.
        audio.export(wav_path, format="wav").close()
    except Exception:
        # Do not leave an orphaned temp file behind when conversion fails.
        os.remove(wav_path)
        raise
    return wav_path
# Text the user wants synthesized.
text_input = st.text_area("Text to convert to speech:", "Hello, this is an AI-generated voice.")

if st.button("Generate Speech"):
    if not text_input.strip():
        st.warning("Please enter some text to convert to speech.")
    elif reference_audio is None:
        # The original passed speaker_wav=None here. XTTS v2 is a
        # voice-cloning model and needs a speaker reference, so ask for one
        # instead of letting synthesis fail.
        st.warning("Please upload a reference audio file in the sidebar.")
    else:
        ref_audio_path = None
        output_path = None
        try:
            with st.spinner("Generating audio..."):
                # Convert the upload only when speech is actually requested,
                # not on every Streamlit rerun.
                ref_audio_path = convert_to_wav(reference_audio)

                # Use a unique output file per request: a fixed "output.wav"
                # in the working directory would be shared (and overwritten)
                # by concurrent sessions.
                with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as out_file:
                    output_path = out_file.name

                # Generate speech using XTTS v2
                tts.tts_to_file(
                    text=text_input,
                    file_path=output_path,
                    speaker_wav=ref_audio_path,
                    language="en",
                )

                # Read the result into memory so the file can be deleted
                # while the audio player keeps working.
                with open(output_path, "rb") as audio_file:
                    audio_bytes = audio_file.read()

            # Play the audio in the Streamlit app
            st.audio(audio_bytes, format="audio/wav")
            st.success("Speech generated successfully!")
        finally:
            # Clean up temporary files, even when synthesis raised.
            for temp_path in (ref_audio_path, output_path):
                if temp_path is None:
                    continue
                try:
                    os.remove(temp_path)
                except FileNotFoundError:
                    # Already gone (e.g. removed by convert_to_wav on failure).
                    pass