File size: 3,248 Bytes
84b3aae
 
 
3127b1b
 
 
31beb62
 
 
84b3aae
dbe86d4
 
 
b8320e7
dbe86d4
 
b8320e7
 
84b3aae
 
 
 
 
eee2253
 
3127b1b
31beb62
3127b1b
 
 
 
 
 
 
eee2253
31beb62
3127b1b
31beb62
 
 
 
eee2253
31beb62
eee2253
3127b1b
31beb62
 
 
 
 
 
 
 
 
 
 
 
 
3127b1b
 
31beb62
 
 
 
 
 
 
 
3127b1b
84b3aae
 
 
 
31beb62
b8320e7
 
 
 
eee2253
31beb62
b8320e7
 
 
84b3aae
b8320e7
 
 
84b3aae
 
3127b1b
 
31beb62
3127b1b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import streamlit as st
import outetts
from scipy.io.wavfile import write
import tempfile
import os
from pydub import AudioSegment
import sounddevice as sd
import wave
import numpy as np

# Initialize model configuration.
# The model weights are fetched from the Hugging Face Hub on first use.
# NOTE(review): Streamlit re-executes this whole script on every widget
# interaction, so this (potentially slow) model load runs on each rerun —
# consider wrapping the interface construction in @st.cache_resource.
model_config = outetts.HFModelConfig_v1(
    model_path="OuteAI/OuteTTS-0.2-500M",
    language="en"  # Supported languages: en, zh, ja, ko
)

# Initialize the interface
interface = outetts.InterfaceHF(model_version="0.2", cfg=model_config)

# Streamlit UI — main page: title plus a short instruction line.
st.title("OuteTTS Speech Synthesis")
st.write("Enter text below to generate speech.")

# Sidebar for reference voice (optional voice cloning): the user uploads a
# sample of the target voice and types what is said in it; both are needed
# to build a speaker profile below.
st.sidebar.title("Voice Cloning")
reference_audio = st.sidebar.file_uploader("Upload a reference audio (any format)", type=["wav", "mp3", "ogg", "flac", "m4a"])
transcript = st.sidebar.text_area("Transcription of the reference audio")

def convert_to_wav(audio_file):
    """Convert an audio file of any pydub-supported format to WAV.

    Parameters:
        audio_file: A path or file-like object (e.g. a Streamlit
            UploadedFile) accepted by ``AudioSegment.from_file``.

    Returns:
        str: Path to a temporary ``.wav`` file holding the converted
        audio. The caller is responsible for deleting it.
    """
    # delete=False keeps the file on disk after the handle is closed.
    # Close the handle immediately: pydub re-opens the same path to write,
    # which fails on Windows while the NamedTemporaryFile handle is open,
    # and leaving it open leaks a file descriptor on every platform.
    temp_audio = tempfile.NamedTemporaryFile(delete=False, suffix=".wav")
    temp_audio.close()
    audio = AudioSegment.from_file(audio_file)
    audio.export(temp_audio.name, format="wav")
    return temp_audio.name

# Build a speaker profile from the uploaded reference audio + transcript.
# NOTE(review): ref_audio_path is only bound inside this branch; any later
# code that references it must guard on the same condition.
if reference_audio and transcript:
    ref_audio_path = convert_to_wav(reference_audio)
    # Create speaker profile
    speaker = interface.create_speaker(ref_audio_path, transcript)
    # Save the speaker profile (persisted to the working directory so it
    # could be reloaded later)
    interface.save_speaker(speaker, "speaker.json")
else:
    # No reference provided: fall back to the model's default voice.
    speaker = None

def record_audio(duration=5, samplerate=44100):
    """Record mono 16-bit audio from the default input device.

    Parameters:
        duration (float): Seconds to record. Defaults to 5.
        samplerate (int): Sample rate in Hz. Defaults to 44100.

    Returns:
        str: Path to a temporary ``.wav`` file containing the recording.
        The caller is responsible for deleting it.
    """
    st.sidebar.write("Recording...")
    # sd.rec starts capture asynchronously; sd.wait() blocks until the
    # requested number of frames has been recorded.
    recording = sd.rec(int(duration * samplerate), samplerate=samplerate, channels=1, dtype=np.int16)
    sd.wait()
    # delete=False keeps the file after the handle is closed; close the
    # handle (via the context manager) before wave.open re-opens the path —
    # required on Windows and avoids leaking a descriptor elsewhere.
    with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as tmp:
        temp_audio_path = tmp.name
    with wave.open(temp_audio_path, "wb") as wf:
        wf.setnchannels(1)
        wf.setsampwidth(2)  # 2 bytes per sample == int16, matching dtype above
        wf.setframerate(samplerate)
        wf.writeframes(recording.tobytes())
    return temp_audio_path

# Fallback path: let the user record a reference voice instead of uploading.
if not speaker:
    st.sidebar.write("Or record your voice below:")
    if st.sidebar.button("Record Voice"):
        ref_audio_path = record_audio()
        st.sidebar.success("Recording complete!")
        # NOTE(review): Streamlit reruns the entire script on every widget
        # interaction. On the rerun triggered by typing into this text_area,
        # the button reads False again, so ref_audio_path is lost and this
        # inner branch likely never executes — `speaker` is probably never
        # set via recording. Persisting ref_audio_path/transcript in
        # st.session_state would fix it; confirm against observed behavior.
        transcript = st.sidebar.text_area("Transcription of the recorded audio")
        if transcript:
            # Create speaker profile from recorded audio
            speaker = interface.create_speaker(ref_audio_path, transcript)
            # Save the speaker profile
            interface.save_speaker(speaker, "speaker.json")

# Main input: the text to synthesize, pre-filled with a demo sentence.
text_input = st.text_area("Text to convert to speech:", "Hello, this is an AI-generated voice.")

if st.button("Generate Speech"):
    with st.spinner("Generating audio..."):
        # Generate speech with or without the speaker profile
        # (speaker=None falls back to the model's default voice).
        output = interface.generate(
            text=text_input,
            temperature=0.1,          # low temperature -> more deterministic output
            repetition_penalty=1.1,
            max_length=4096,          # token budget for the generation
            speaker=speaker
        )
        
        # Save the synthesized speech to a file in the working directory.
        # NOTE(review): a fixed name means concurrent sessions overwrite
        # each other's output — confirm single-user deployment.
        output_path = "output.wav"
        output.save(output_path)
        
        # Play the audio in the Streamlit app
        st.audio(output_path, format="audio/wav")
        st.success("Speech generated successfully!")

# Clean up the temporary reference-audio file, if one was created.
# Bug fix: ref_audio_path is only bound when BOTH an upload and a
# transcript were provided (see the speaker-profile branch above), so
# guarding on reference_audio alone raised NameError whenever a file was
# uploaded without a transcript. Guard on the same condition instead.
if reference_audio and transcript:
    os.remove(ref_audio_path)