"""Streamlit front-end for XTTS v2 speech synthesis with optional voice cloning.

The user types text, optionally uploads a reference recording in the sidebar,
and presses a button; the synthesized speech is then played back in the page.
"""

import os
import tempfile

import streamlit as st
from pydub import AudioSegment
from TTS.api import TTS

MODEL_NAME = "tts_models/multilingual/multi-dataset/xtts_v2"


@st.cache_resource
def load_tts_model():
    """Build the TTS model once and reuse it across script reruns.

    Streamlit re-executes this file on every widget interaction; without
    caching the model would be constructed again each time.
    """
    return TTS(MODEL_NAME)


def _remove_file(path):
    """Delete ``path`` if it is set and still exists; do nothing otherwise."""
    if not path:
        return
    try:
        os.remove(path)
    except FileNotFoundError:
        pass


def convert_to_wav(audio_file):
    """Convert an audio file to a temporary WAV file and return its path.

    Args:
        audio_file: A path or file-like object that pydub can decode.

    Returns:
        Path of a newly created ``.wav`` file. The caller owns the file and
        is responsible for deleting it.

    Raises:
        Exception: Whatever pydub raises when the input cannot be decoded or
            exported. The temporary file is removed before re-raising.
    """
    # Only the unique path is needed; close the handle right away so the
    # export below can reopen the file by name.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        wav_path = temp_audio.name

    try:
        # Rewind file-like inputs in case an earlier read left the cursor
        # at the end of the stream.
        if hasattr(audio_file, "seek"):
            audio_file.seek(0)
        audio = AudioSegment.from_file(audio_file)
        audio.export(wav_path, format="wav")
    except Exception:
        _remove_file(wav_path)
        raise
    return wav_path


def synthesize_speech(text, speaker_wav=None, language="en"):
    """Synthesize ``text`` and return the resulting WAV data as bytes.

    The audio is written to a per-call temporary file (instead of a shared
    fixed filename) so simultaneous sessions cannot overwrite each other,
    and the file is removed once its contents have been read.

    Args:
        text: Text to speak.
        speaker_wav: Path to a reference WAV for voice cloning, or ``None``.
        language: Language code forwarded to the model.

    Returns:
        The generated WAV file contents.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as handle:
        output_path = handle.name

    try:
        tts.tts_to_file(
            text=text,
            file_path=output_path,
            speaker_wav=speaker_wav if speaker_wav else None,
            language=language,
        )
        with open(output_path, "rb") as wav_file:
            return wav_file.read()
    finally:
        _remove_file(output_path)


# Initialize the TTS model (cached, see load_tts_model).
tts = load_tts_model()

# Streamlit UI
st.title("XTTS v2 Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar for reference voice
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)

text_input = st.text_area(
    "Text to convert to speech:",
    "Hello, this is an AI-generated voice.",
)

if st.button("Generate Speech"):
    if not text_input.strip():
        st.warning("Please enter some text to convert to speech.")
    else:
        # The reference clip is converted only when synthesis is actually
        # requested, so ordinary reruns do not leave temp files behind.
        ref_audio_path = None
        try:
            with st.spinner("Generating audio..."):
                if reference_audio is not None:
                    ref_audio_path = convert_to_wav(reference_audio)
                audio_bytes = synthesize_speech(text_input, ref_audio_path)
        except Exception as exc:
            # UI boundary: report the failure instead of aborting the run.
            # NOTE(review): XTTS v2 is a multi-speaker model and the Coqui
            # API appears to reject calls without a speaker reference --
            # confirm against the installed TTS version.
            st.error(f"Speech generation failed: {exc}")
        else:
            # Play the audio in the Streamlit app
            st.audio(audio_bytes, format="audio/wav")
            st.success("Speech generated successfully!")
        finally:
            # Clean up the temporary reference file on success and failure.
            _remove_file(ref_audio_path)