import os
import tempfile

import streamlit as st
import torch
from TTS.api import TTS

# Accept the Coqui model license non-interactively so the model can be downloaded.
os.environ["COQUI_TOS_AGREED"] = "1"

device = "cuda" if torch.cuda.is_available() else "cpu"


# Load the XTTS v2 model once and cache it across Streamlit reruns.
@st.cache_resource
def load_tts_model():
    return TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)


tts = load_tts_model()


def clone(text, audio_file, language, speaking_rate, pitch, volume, emotion,
          sample_rate, temperature, seed):
    """Synthesize `text` in the voice of the uploaded reference audio and return the output path.

    Note: speaking_rate, pitch, volume, emotion, sample_rate, and temperature are
    collected by the UI but are not applied by this basic XTTS call.
    """
    if seed is not None:
        torch.manual_seed(int(seed))

    # The Streamlit uploader returns an in-memory file; write it to disk so the
    # model can read it as the speaker reference.
    suffix = os.path.splitext(audio_file.name)[1] or ".wav"
    with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as ref:
        ref.write(audio_file.read())
        speaker_path = ref.name

    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_audio_path = temp_audio.name

    try:
        tts.tts_to_file(
            text=text,
            speaker_wav=speaker_path,
            language=language,
            file_path=temp_audio_path,
        )
    finally:
        # Remove the on-disk copy of the reference audio once synthesis is done.
        os.remove(speaker_path)

    return temp_audio_path


st.title('Advanced Voice Clone')
st.write('Customize your voice cloning experience with various parameters.')

text = st.text_area('Text')
audio_file = st.file_uploader('Voice reference audio file', type=['wav', 'mp3'])
language = st.selectbox('Language', ["en", "es", "fr", "de", "it"], index=0)
speaking_rate = st.slider('Speaking Rate', 0.5, 2.0, 1.0)
pitch = st.slider('Pitch Adjustment', -10, 10, 0)
volume = st.slider('Volume', 0.1, 2.0, 1.0)
emotion = st.selectbox('Emotion', ["neutral", "happy", "sad", "angry"], index=0)
sample_rate = st.selectbox('Sample Rate', [22050, 24000, 44100, 48000], index=1)
temperature = st.slider('Temperature', 0.1, 1.0, 0.8)
seed = st.number_input('Seed (optional)', value=None)

if st.button('Generate'):
    if text and audio_file:
        with st.spinner('Generating audio...'):
            output_path = clone(text, audio_file, language, speaking_rate, pitch,
                                volume, emotion, sample_rate, temperature, seed)
        # Read the result into memory so the temporary output file can be
        # cleaned up right after the audio player is rendered.
        with open(output_path, "rb") as f:
            st.audio(f.read(), format="audio/wav")
        os.remove(output_path)
    else:
        st.warning('Please provide both text and a voice reference audio file.')
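
# Usage sketch (the file name "app.py" is an assumption, not from the original
# source): install the Coqui TTS package, which is published on PyPI as "TTS",
# along with Streamlit, then launch the app with the Streamlit CLI:
#
#   pip install TTS streamlit torch
#   streamlit run app.py
#
# The first run downloads the XTTS v2 checkpoint, so it can take a while.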