# Streamlit demo app: clone a voice from a reference .wav file using Coqui XTTS v2.
import torch
import os
import streamlit as st
from TTS.api import TTS
from tempfile import NamedTemporaryFile

# XTTS is distributed under the Coqui Public Model License (CPML):
# https://coqui.ai/cpml. Using this app means agreeing to that license, so the
# agreement flag is set up front, before any model is loaded.
# NOTE(review): presumably read by the TTS library to skip its interactive
# license prompt - confirm against the library.
os.environ.update(COQUI_TOS_AGREED="1")

def generate_audio(audio_file, text_input, language='en'):
  """Synthesize `text_input` in the voice of `audio_file` and return a WAV path.

  Args:
    audio_file: Reference speaker audio, forwarded unchanged as `speaker_wav`
      to `tts_to_file`.
    text_input: The text to be spoken.
    language: Language code forwarded to `tts_to_file`. Defaults to 'en',
      the value that was previously hard-coded.

  Returns:
    Path of a newly created temporary `.wav` file holding the synthesized
    speech. The file is created with `delete=False`, so it is not removed
    automatically; the caller owns it.

  Raises:
    Any exception raised while loading the model or synthesizing. If
    synthesis fails, the temporary output file is removed before the
    exception propagates.
  """
  # Initialize model (done on every call); use the GPU when one is available
  model = "tts_models/multilingual/multi-dataset/xtts_v2"  # coqui/XTTS-v2
  device = 'cuda' if torch.cuda.is_available() else 'cpu'
  tts = TTS(model).to(device)

  # Only reserve a unique path here. Our own handle is closed before the
  # library opens the same path for writing.
  with NamedTemporaryFile(delete=False, suffix='.wav') as tmp_file:
    output_path = tmp_file.name

  try:
    tts.tts_to_file(text=text_input, speaker_wav=audio_file, language=language, file_path=output_path)
  except BaseException:
    # delete=False means nothing else cleans this file up; do not leave an
    # empty or partial file behind on failure. Always re-raised below.
    try:
      os.remove(output_path)
    except OSError:
      pass
    raise

  return output_path

def main():
  """Render the voice-clone page and run synthesis when requested.

  Flow:
    1. Show the title and subtitle.
    2. Let the user upload a .wav file; if none is uploaded, offer a
       selectbox of bundled samples instead.
    3. Once a reference audio is chosen, preview it and show a text area
       plus a 'Synthesize' button.
    4. On click, validate the text, call `generate_audio`, and play the
       result or show an error message.
  """
  # Title
  title = """<h1 align="center" style="font-size: 2rem;">Voice Clone</h1>"""
  st.markdown(title, unsafe_allow_html=True)

  # Subtitle
  subtitle = """<h2 align="center" style="font-size: 1.2rem; margin-bottom: 2rem;">Make your favorite characters say anything!</h2>"""
  st.markdown(subtitle, unsafe_allow_html=True)

  # The empty first entry is the "nothing selected" choice: it maps to an
  # empty path, which is falsy and keeps the rest of the page hidden.
  sample_files = {
    '': '',
    'Stewie Griffin': 'sample_inputs/stewie.wav',
    'Donald Trump': 'sample_inputs/trump.wav',
    'Joe Rogan': 'sample_inputs/rogan.wav'
  }

  # Upload audio file
  uploaded_file = st.file_uploader('Add an audio (.wav) file of the voice you want to clone...', type=['wav'])

  # An upload takes precedence; the sample picker is only shown without one.
  if uploaded_file is not None:
    speaker_file = uploaded_file
  else:
    selected_sample = st.selectbox('Or choose a sample:', list(sample_files))
    speaker_file = sample_files[selected_sample]

  if not speaker_file:
    return

  st.header('Reference Audio')
  st.audio(speaker_file, format='audio/wav')

  # Input text
  text_input = st.text_area('What do you want your character to say? Try to keep the prompt around 2 sentences.')

  if not st.button('Synthesize'):
    return

  # Treat whitespace-only input the same as no input.
  if not text_input or not text_input.strip():
    st.error('Please provide a text input!')
    return

  try:
    with st.spinner('Synthesizing...'):
      output_path = generate_audio(speaker_file, text_input)
  except Exception:
    # Deliberately not a bare `except:` so that KeyboardInterrupt, SystemExit
    # and other BaseException-based control flow still propagate.
    st.error('There was an issue synthesizing the text. Please check the input and try again. Try to keep the input around 2 sentences, and less than 200 characters.')
    return

  st.header('Synthesized Audio')
  st.audio(output_path, format='audio/wav')

# Build the page only when this file is executed as the script entry point,
# not when it is imported as a module.
if __name__ == "__main__":
  main()