"""Streamlit front end: type text, get a lip-synced talking-avatar video (Wav2Lip)."""
import os
import pickle
import sys

import streamlit as st
import torch
# Make the bundled Wav2Lip sources importable before importing Avatar.
path_to_add = os.path.join(os.path.dirname(__file__), "Wav2Lip")
if path_to_add not in sys.path:
    sys.path.insert(0, path_to_add)
from avatar import Avatar

# Available avatars and their preview images (index-aligned with `options`).
options = ['Aude', 'Kyla', 'Liv', 'MC6']
images = ['ref_videos/Aude.png', 'ref_videos/Kyla.png', 'ref_videos/Liv.png', 'ref_videos/MC6.png']
big_text = """
    <div style='text-align: center;'>
        <h1 style='font-size: 30px;'>Text to Speech Synchronized Video</h1>
    </div>
    """
# Display the styled title
st.markdown(big_text, unsafe_allow_html=True)
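# Placeholders for status text and an initialization progress bar.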
current_status_placeholder = st.empty()
init_progress_bar = st.progress(0)
# One-time initialization: build the Avatar, load weights, and cache everything
# in st.session_state so reruns skip this block.
if 'is_initialized' not in st.session_state:
    st.session_state.avatar = Avatar()
    st.session_state.avatar.export_video = False
    current_status_placeholder.write("Loading model...")
    st.session_state.avatar.load_model("checkpoint/wav2lip_gan.pth")
    current_status_placeholder.write("Model loaded.")
    st.session_state.avatar.device = 'cuda' if torch.cuda.is_available() else 'cpu'
    print(st.session_state.avatar.device)
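    # Output and temp locations used by the avatar pipeline.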
    st.session_state.avatar.output_audio_path = "audio/"
    st.session_state.avatar.output_audio_filename = "result.wav"
    st.session_state.avatar.temp_lip_video_no_voice_path = "temp/"
    st.session_state.avatar.temp_lip_video_no_voice_filename = "result.avi"
    st.session_state.avatar.output_video_path = "results/"
    st.session_state.avatar.output_video_name = "result_voice.mp4"
    st.session_state.selected_option = "Liv"
    st.session_state.avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"

    st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
    st.session_state.avatar.face_detect_batch_size = 16
    # Run face detection on the default reference frames.
    st.session_state.avatar.create_face_detection_results(st.session_state.avatar.video_full_frames, True)
    current_status_placeholder.write("Loading face detection results...")
    # Load the precomputed face-detection results for every avatar.
    st.session_state.face_det_results_dict = {}
    for option in options:
        with open(f'ref_videos/{option}_face_det_result.pkl', 'rb') as file:
            st.session_state.face_det_results_dict[option] = pickle.load(file)
    st.session_state.avatar.face_detect_img_results = st.session_state.face_det_results_dict[st.session_state.selected_option]
    st.session_state.avatar.face_det_results_path_and_name = 'ref_videos/Liv_face_det_result.pkl'
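    # The per-avatar pickles are assumed to be generated offline, e.g. (a sketch
    # reusing this file's own calls; argument meanings follow the usage above):
    #   avatar.get_video_full_frames(f"ref_videos/{option}.mp4")
    #   avatar.create_face_detection_results(avatar.video_full_frames, True)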

    # load_face_detection_results() unpickles face_det_results_path_and_name
    # into face_detect_img_results.
    st.session_state.avatar.load_face_detection_results()
    # Warm up the pipeline on a short sample utterance.
    input_text = "Hi, how are you?"
    st.session_state.avatar.text_to_lip_video(input_text, init_progress_bar)
    current_status_placeholder.write("Face detection results loaded.")

    st.session_state['is_initialized'] = True
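
# Everything below runs on every Streamlit rerun; the heavy state above is cached.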




# Create the radio button group
selected_option = st.radio("Choose an option:", options, index=options.index(st.session_state.selected_option))
img_col1, img_col2 = st.columns([1, 1])

with img_col1:
    st.image(images[options.index(selected_option)])


if st.session_state.selected_option != selected_option:
    # The avatar changed: reload its reference video and cached face detections.
    st.session_state.selected_option = selected_option
    st.session_state.avatar.ref_video_path_and_filename = f"ref_videos/{st.session_state.selected_option}.mp4"

    st.session_state.avatar.get_video_full_frames(st.session_state.avatar.ref_video_path_and_filename)
    st.session_state.avatar.face_detect_img_results = st.session_state.face_det_results_dict[st.session_state.selected_option]

# Text input; a non-empty value triggers a new inference run.
user_input = st.text_input("Enter your text:")
inference_progress_bar = st.progress(0)
if user_input:
    st.session_state.avatar.dir_clean_up()
    # Display the entered text
    st.write("You entered:", user_input)
    st.session_state.avatar.export_video = True
    st.session_state.avatar.text_to_lip_video(user_input, inference_progress_bar)
    # Center the finished video in the wide middle column.
    col1, col2, col3 = st.columns([1, 4, 1])
    with col2:
        # Assumes output_video_path ends with a trailing slash (set above as "results/").
        st.video(st.session_state.avatar.output_video_path + st.session_state.avatar.output_video_name)