# Voice_clone / app.py
# Author: Hammad112 — "Update app.py" (commit 31beb62, verified)
# (Header recovered from HuggingFace Spaces page residue; original size 3.25 kB)
import streamlit as st
import outetts
from scipy.io.wavfile import write
import tempfile
import os
from pydub import AudioSegment
import sounddevice as sd
import wave
import numpy as np
# Model configuration: which HF checkpoint to load and the synthesis language.
model_config = outetts.HFModelConfig_v1(
    model_path="OuteAI/OuteTTS-0.2-500M",
    language="en" # Supported languages: en, zh, ja, ko
)
# Build the TTS interface once at module import; reused by every Streamlit rerun.
# NOTE(review): this reloads the model on each full script restart — consider
# caching (e.g. st.cache_resource) if startup time matters.
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)
# --- Main page UI ---
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# --- Sidebar: optional voice-cloning inputs (reference clip + its transcript) ---
st.sidebar.title("Voice Cloning")
# `reference_audio` is a file-like UploadedFile, or None until the user uploads.
reference_audio = st.sidebar.file_uploader("Upload a reference audio (any format)", type=["wav", "mp3", "ogg", "flac", "m4a"])
# Free-text transcript of the reference clip; empty string until filled in.
transcript = st.sidebar.text_area("Transcription of the reference audio")
def convert_to_wav(audio_file):
    """Convert an uploaded audio file (any pydub-readable format) to WAV.

    Args:
        audio_file: A file-like object (e.g. Streamlit's UploadedFile).

    Returns:
        str: Path to a temporary ``.wav`` file. The caller is responsible
        for deleting it when done (``delete=False`` keeps it on disk).
    """
    # BUG FIX: the original never closed the NamedTemporaryFile handle,
    # leaking a file descriptor per upload (and on Windows the open handle
    # would block pydub from writing to the same path). The `with` block
    # closes the handle immediately; the file itself survives.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_audio:
        temp_path = temp_audio.name
    # pydub sniffs the input format from the stream (requires ffmpeg).
    audio = AudioSegment.from_file(audio_file)
    audio.export(temp_path, format="wav")
    return temp_path
# Build a voice-cloning speaker profile only when the user has supplied
# both a reference clip and its transcript; otherwise fall back to None
# (the model's default voice is used downstream).
speaker = None
if reference_audio and transcript:
    ref_audio_path = convert_to_wav(reference_audio)
    # Derive the cloning profile from the converted reference recording.
    speaker = interface.create_speaker(ref_audio_path, transcript)
    # Persist the profile to disk for later reuse.
    interface.save_speaker(speaker, "speaker.json")
def record_audio(duration=5, samplerate=44100):
    """Record mono 16-bit audio from the default input device to a temp WAV.

    Args:
        duration: Capture length in seconds.
        samplerate: Sample rate in Hz.

    Returns:
        str: Path to the recorded ``.wav`` file. Caller owns cleanup.
    """
    st.sidebar.write("Recording...")
    frames = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()  # block until the capture finishes
    # BUG FIX: the original grabbed `.name` off a NamedTemporaryFile without
    # ever closing it, leaking an open file descriptor per recording. The
    # `with` block closes the handle; the file persists (delete=False).
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        temp_audio_path = tmp.name
    with wave.open(temp_audio_path, "wb") as wf:
        wf.setnchannels(1)           # mono
        wf.setsampwidth(2)           # 2 bytes per sample = int16
        wf.setframerate(samplerate)
        wf.writeframes(frames.tobytes())
    return temp_audio_path
# Fallback path: no uploaded reference, so offer in-app recording.
if not speaker:
    st.sidebar.write("Or record your voice below:")
    # BUG FIX: in the original, the transcript text_area lived inside the
    # `if st.sidebar.button(...)` branch. Streamlit buttons reset to False on
    # the very next rerun (which typing in the text_area triggers), so the
    # transcript could never actually be submitted. Persisting the recording
    # path in session_state keeps the transcript widget alive across reruns.
    if st.sidebar.button("Record Voice"):
        st.session_state["recorded_audio_path"] = record_audio()
        st.sidebar.success("Recording complete!")
    if "recorded_audio_path" in st.session_state:
        ref_audio_path = st.session_state["recorded_audio_path"]
        transcript = st.sidebar.text_area("Transcription of the recorded audio")
        if transcript:
            # Create and persist a speaker profile from the recorded audio.
            speaker = interface.create_speaker(ref_audio_path, transcript)
            interface.save_speaker(speaker, "speaker.json")
# --- Synthesis ---
text_input = st.text_area("Text to convert to speech:", "Hello, this is an AI-generated voice.")
if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        # `speaker=None` falls back to the model's default voice.
        output = interface.generate(
            text=text_input,
            temperature=0.1,
            repetition_penalty=1.1,
            max_length=4096,
            speaker=speaker
        )
        # Save the synthesized speech, then play it inline.
        output_path = "output.wav"
        output.save(output_path)
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")
    # BUG FIX: the original ran `os.remove(ref_audio_path)` whenever a file
    # was uploaded, but `ref_audio_path` is only bound when BOTH an upload
    # and a transcript exist — uploading without a transcript raised
    # NameError here. Guard on the same condition that binds the variable,
    # and make the cleanup best-effort.
    if reference_audio and transcript:
        try:
            os.remove(ref_audio_path)
        except OSError:
            pass  # temp file already gone or locked; nothing user-visible