"""Streamlit app for OuteTTS speech synthesis with optional voice cloning.

Users can upload (or record) a short reference clip plus its transcription to
build a speaker profile, then synthesize arbitrary text in that voice.
"""

import os
import tempfile
import wave

import numpy as np
import outetts
import sounddevice as sd
import streamlit as st
from pydub import AudioSegment


@st.cache_resource
def load_interface():
    """Build the OuteTTS interface once and reuse it across Streamlit reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the 500M-parameter model would be reloaded each time,
    making the app unusably slow.
    """
    model_config = outetts.HFModelConfig_v1(
        model_path="OuteAI/OuteTTS-0.2-500M",
        language="en",  # Supported languages: en, zh, ja, ko
    )
    return outetts.InterfaceHF(model_version="0.2", cfg=model_config)


interface = load_interface()

# Streamlit UI
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar for reference voice
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)
transcript = st.sidebar.text_area("Transcription of the reference audio")


def convert_to_wav(audio_file):
    """Convert an uploaded audio file of any supported format to WAV.

    Returns the path of a temporary ``.wav`` file; the caller is
    responsible for deleting it when done.
    """
    # delete=False keeps the file on disk after the handle is closed; close
    # the handle right away so pydub can (re)open the path on all platforms,
    # including Windows where open handles block a second writer.
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_audio.close()
    audio = AudioSegment.from_file(audio_file)
    audio.export(temp_audio.name, format="wav")
    return temp_audio.name


# Initialize up front so the cleanup at the bottom of the script never hits
# an undefined name (the original raised NameError when an upload was
# present but the transcription was still empty).
ref_audio_path = None

if reference_audio and transcript:
    ref_audio_path = convert_to_wav(reference_audio)
    # Create speaker profile
    speaker = interface.create_speaker(ref_audio_path, transcript)
    # Save the speaker profile
    interface.save_speaker(speaker, "speaker.json")
else:
    speaker = None


def record_audio(duration=5, samplerate=44100):
    """Record ``duration`` seconds of mono 16-bit audio from the default mic.

    Returns the path of a temporary WAV file holding the recording; the
    caller is responsible for deleting it.
    """
    st.sidebar.write("Recording...")
    recording = sd.rec(
        int(duration * samplerate),
        samplerate=samplerate,
        channels=1,
        dtype=np.int16,
    )
    sd.wait()  # Block until the recording finishes
    temp_file = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_file.close()  # Close before wave reopens the path (Windows-safe)
    with wave.open(temp_file.name, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == 16-bit PCM
        wf.setframerate(samplerate)
        wf.writeframes(recording.tobytes())
    return temp_file.name


if not speaker:
    st.sidebar.write("Or record your voice below:")
    # The recorded path must survive Streamlit reruns: typing the
    # transcription triggers a rerun, which would discard a plain local
    # variable set inside the button branch. Persist it in session_state
    # and render the transcription widget OUTSIDE the button branch so it
    # does not disappear on the very next interaction.
    if st.sidebar.button("Record Voice"):
        st.session_state["recorded_audio_path"] = record_audio()
        st.sidebar.success("Recording complete!")
    if st.session_state.get("recorded_audio_path"):
        transcript = st.sidebar.text_area(
            "Transcription of the recorded audio"
        )
        if transcript:
            # Create speaker profile from recorded audio
            speaker = interface.create_speaker(
                st.session_state["recorded_audio_path"], transcript
            )
            # Save the speaker profile
            interface.save_speaker(speaker, "speaker.json")

text_input = st.text_area(
    "Text to convert to speech:", "Hello, this is an AI-generated voice."
)

if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        # Generate speech with or without the speaker profile
        output = interface.generate(
            text=text_input,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=speaker,
        )
        # Save the synthesized speech to a file
        output_path = "output.wav"
        output.save(output_path)
        # Play the audio in the Streamlit app
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")

# Clean up the temporary WAV produced from an upload. It is regenerated by
# convert_to_wav on every rerun, so removing it here never strands state.
# (The recorded file is intentionally kept: its path lives in session_state
# and is reused across reruns to rebuild the speaker profile.)
if ref_audio_path and os.path.exists(ref_audio_path):
    os.remove(ref_audio_path)