# OuteTTS Streamlit demo — text-to-speech with optional voice cloning
# (reference-audio upload or microphone recording).
import os
import tempfile
import wave

import numpy as np
import outetts
import sounddevice as sd
import streamlit as st
from pydub import AudioSegment
from scipy.io.wavfile import write
# Initialize the OuteTTS model configuration.
model_config = outetts.HFModelConfig_v1(
    model_path="OuteAI/OuteTTS-0.2-500M",
    language="en",  # Supported languages: en, zh, ja, ko
)

# Build the inference interface once at module load (model download/load is slow).
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)
# --- Main page UI ---
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# --- Sidebar: optional reference voice for cloning ---
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader(
    "Upload a reference audio (any format)",
    type=["wav", "mp3", "ogg", "flac", "m4a"],
)
transcript = st.sidebar.text_area("Transcription of the reference audio")
# Function to convert audio to WAV format | |
def convert_to_wav(audio_file):
    """Convert an uploaded audio file of any supported format to WAV.

    Parameters
    ----------
    audio_file : file-like object
        Uploaded audio (wav/mp3/ogg/flac/m4a) as provided by Streamlit.

    Returns
    -------
    str
        Path to a temporary WAV file; the caller is responsible for removing it.
    """
    # Create the destination path and close the handle immediately: the
    # original kept NamedTemporaryFile's descriptor open (fd leak, and on
    # Windows the open handle blocks pydub from writing to the same path).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        wav_path = tmp.name
    audio = AudioSegment.from_file(audio_file)
    audio.export(wav_path, format="wav")
    return wav_path
# Build a speaker profile only when both a reference file and its
# transcription are available; otherwise fall back to the default voice.
if reference_audio and transcript:
    ref_audio_path = convert_to_wav(reference_audio)
    # Create the speaker profile from the reference audio + transcript.
    speaker = interface.create_speaker(ref_audio_path, transcript)
    # Persist the profile so it can be reused across sessions.
    interface.save_speaker(speaker, "speaker.json")
else:
    speaker = None
# Recording functionality | |
def record_audio(duration=5, samplerate=44100):
    """Record mono audio from the default input device into a WAV file.

    Parameters
    ----------
    duration : int
        Recording length in seconds.
    samplerate : int
        Sampling rate in Hz.

    Returns
    -------
    str
        Path to a temporary WAV file; the caller is responsible for removing it.
    """
    st.sidebar.write("Recording...")
    recording = sd.rec(
        int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16
    )
    sd.wait()  # block until the recording completes

    # Allocate the temp path and close the handle right away; the original
    # left NamedTemporaryFile's descriptor open (fd leak, and the open handle
    # conflicts with wave.open on Windows).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        temp_audio_path = tmp.name
    with wave.open(temp_audio_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == int16
        wf.setframerate(samplerate)
        wf.writeframes(recording.tobytes())
    return temp_audio_path
if not speaker:
    st.sidebar.write("Or record your voice below:")
    if st.sidebar.button("Record Voice"):
        # Persist the recorded path in session_state: Streamlit reruns the
        # whole script when the transcript text_area changes, which would
        # discard a plain local variable (the original lost the recording
        # before a transcript could ever be entered).
        st.session_state["recorded_audio_path"] = record_audio()
        st.sidebar.success("Recording complete!")
    if "recorded_audio_path" in st.session_state:
        recorded_transcript = st.sidebar.text_area(
            "Transcription of the recorded audio"
        )
        if recorded_transcript:
            # Create and persist a speaker profile from the recorded audio.
            speaker = interface.create_speaker(
                st.session_state["recorded_audio_path"], recorded_transcript
            )
            interface.save_speaker(speaker, "speaker.json")
text_input = st.text_area(
    "Text to convert to speech:", "Hello, this is an AI-generated voice."
)

if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        # Generate speech; `speaker` may be None, in which case the model's
        # default voice is used.
        output = interface.generate(
            text=text_input,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=speaker,
        )
        # Save the synthesized speech and play it back in the app.
        output_path = "output.wav"
        output.save(output_path)
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")

    # Clean up the converted reference file.  Check BOTH conditions:
    # `ref_audio_path` is only assigned when a reference file AND its
    # transcript were provided, so the original's `if reference_audio:`
    # raised NameError when a file was uploaded without a transcript.
    if reference_audio and transcript:
        os.remove(ref_audio_path)