Update app.py
app.py CHANGED
@@ -1,217 +1,125 @@
  import os
- import platform
- import uuid
- import shutil
- from pydub import AudioSegment
- import spaces
- import torch
  import gradio as gr
- from huggingface_hub import snapshot_download

- # … five import lines lost in extraction; per their use below they bring in
- # Audio2Coeff, CropAndExtract, AnimateFromCoeff and AnimateFromCoeff_PIRender
- from src.generate_batch import get_data
- from src.generate_facerender_batch import get_facerender_data
- from src.utils.init_path import init_path

- # … two lines lost in extraction; checkpoint_path and config_path are set here
- device = "cuda" if torch.cuda.is_available(
- ) else "mps" if platform.system() == 'Darwin' else "cpu"
-
- os.environ['TORCH_HOME'] = checkpoint_path
- snapshot_download(repo_id='vinthony/SadTalker-V002rc',
-                   local_dir=checkpoint_path, local_dir_use_symlinks=True)
-
-
- def mp3_to_wav(mp3_filename, wav_filename, frame_rate):
-     AudioSegment.from_file(file=mp3_filename).set_frame_rate(
-         frame_rate).export(wav_filename, format="wav")
-
-
- @spaces.GPU(duration=120)
- def generate_video(source_image, driven_audio, preprocess='crop', still_mode=False, use_enhancer=False,
-                    batch_size=1, size=256, pose_style=0, facerender='facevid2vid', exp_scale=1.0,
-                    use_ref_video=False, ref_video=None, ref_info=None, use_idle_mode=False,
-                    length_of_audio=0, use_blink=True, result_dir='./results/'):
-     # Initialize models and paths
-     sadtalker_paths = init_path(
-         checkpoint_path, config_path, size, False, preprocess)
-     audio_to_coeff = Audio2Coeff(sadtalker_paths, device)
-     preprocess_model = CropAndExtract(sadtalker_paths, device)
-     animate_from_coeff = AnimateFromCoeff(sadtalker_paths, device) if facerender == 'facevid2vid' and device != 'mps' \
-         else AnimateFromCoeff_PIRender(sadtalker_paths, device)
-
-     # Create directories for saving results
-     time_tag = str(uuid.uuid4())
-     save_dir = os.path.join(result_dir, time_tag)
-     os.makedirs(save_dir, exist_ok=True)
-     input_dir = os.path.join(save_dir, 'input')
-     os.makedirs(input_dir, exist_ok=True)
-
-     # Process source image
-     pic_path = os.path.join(input_dir, os.path.basename(source_image))
-     shutil.move(source_image, input_dir)
-
-     # Process driven audio
-     if driven_audio and os.path.isfile(driven_audio):
-         audio_path = os.path.join(input_dir, os.path.basename(driven_audio))
-         if '.mp3' in audio_path:
-             mp3_to_wav(driven_audio, audio_path.replace('.mp3', '.wav'), 16000)
-             audio_path = audio_path.replace('.mp3', '.wav')
-         else:
-             shutil.move(driven_audio, input_dir)
-     elif use_idle_mode:
-         audio_path = os.path.join(
-             input_dir, 'idlemode_'+str(length_of_audio)+'.wav')
-         AudioSegment.silent(
-             duration=1000*length_of_audio).export(audio_path, format="wav")
      else:
-         …  # line lost in extraction

-     …
-     if …:  # condition truncated; ref_video_videoname is derived from ref_video here
-         …
-         audio_path = os.path.join(save_dir, ref_video_videoname+'.wav')
-         os.system(
-             f"ffmpeg -y -hide_banner -loglevel error -i {ref_video} {audio_path}")
-         ref_video_frame_dir = os.path.join(save_dir, ref_video_videoname)
-         os.makedirs(ref_video_frame_dir, exist_ok=True)
-         ref_video_coeff_path, _, _ = preprocess_model.generate(
-             ref_video, ref_video_frame_dir, preprocess, source_image_flag=False)
      else:
-         …  # line lost in extraction

-     # … two lines lost in extraction; first_frame_dir is defined here
-     os.makedirs(first_frame_dir, exist_ok=True)
-     first_coeff_path, crop_pic_path, crop_info = preprocess_model.generate(
-         pic_path, first_frame_dir, preprocess, True, size)
-     if first_coeff_path is None:
-         raise AttributeError("No face is detected")

-     # … five lines lost in extraction; the use_ref_video handling and the
-     # ref_info == 'pose' branch (setting ref_pose_coeff_path) sit here
-     elif ref_info == 'blink':
-         ref_eyeblink_coeff_path = ref_video_coeff_path
-     elif ref_info == 'pose+blink':
-         ref_pose_coeff_path = ref_eyeblink_coeff_path = ref_video_coeff_path
-     else:
-         ref_pose_coeff_path = ref_eyeblink_coeff_path = None
-
-     # Generate coefficients from audio or reference video
-     if use_ref_video and ref_info == 'all':
-         coeff_path = ref_video_coeff_path
-     else:
-         batch = get_data(first_coeff_path, audio_path, device, ref_eyeblink_coeff_path=ref_eyeblink_coeff_path,
-                          still=still_mode, idlemode=use_idle_mode, length_of_audio=length_of_audio, use_blink=use_blink)
-         coeff_path = audio_to_coeff.generate(
-             batch, save_dir, pose_style, ref_pose_coeff_path)
-
-     # Generate video from coefficients
-     data = get_facerender_data(coeff_path, crop_pic_path, first_coeff_path, audio_path, batch_size, still_mode=still_mode,
-                                preprocess=preprocess, size=size, expression_scale=exp_scale, facemodel=facerender)
-     return_path = animate_from_coeff.generate(data, save_dir, pic_path, crop_info, enhancer='gfpgan' if use_enhancer else None,
-                                               preprocess=preprocess, img_size=size)
-     video_name = data['video_name']
-     print(f'The generated video is named {video_name} in {save_dir}')
-
-     return return_path

-
-
- with gr.…  # Blocks/UI setup truncated in extraction
-     # … three lines lost in extraction
-     with gr.TabItem('Source image'):
-         with gr.Row():
-             source_image = gr.Image(
-                 label="Source image", sources="upload", type="filepath", elem_id="img2img_image")

  with gr.Tabs(elem_id="sadtalker_driven_audio"):
-     with gr.TabItem(…  # truncated in extraction
-         gr.…
-     # … old lines 141-174 (about 34 UI-definition lines) lost in extraction
      pose_style = gr.Slider(
-         minimum=0, maximum=…  # arguments truncated in extraction
-     …  # line lost; a second slider opens here, its arguments follow
-         minimum=0, maximum=3, step=0.1, label="expression scale", value=1)
-     blink_every = gr.Checkbox(
-         label="use eye blink", value=True)
-
-     with gr.Row():
      size_of_image = gr.Radio(
-         [256, 512],
      preprocess_type = gr.Radio(
-         […  # option list truncated in extraction
      is_still_mode = gr.Checkbox(
-         label="Still Mode (fewer head motion, works with preprocess `full`)"
-     …  # line lost; a facerender gr.Radio opens here, its arguments follow
-         ['facevid2vid', 'pirender'], value='facevid2vid', label='facerender', info="which face render?")
-
-     with gr.Row():
      batch_size = gr.Slider(
-         label="batch size in generation", step=1, maximum=10, value=…  # truncated in extraction
-     # … old lines 197-217 (the remaining removed lines) lost in extraction
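For context on what this commit removes: the old file downloaded the SadTalker-V002rc checkpoints with snapshot_download and ran generate_video on Hugging Face ZeroGPU hardware via the spaces package, as the @spaces.GPU(duration=120) decorator above shows. A minimal sketch of that ZeroGPU pattern, with an illustrative function standing in for generate_video:

    import spaces
    import torch

    @spaces.GPU(duration=120)  # request a ZeroGPU slot for up to 120 s per call
    def run_inference(batch: torch.Tensor) -> torch.Tensor:
        # a CUDA device is only guaranteed to be attached inside the decorated function
        device = "cuda" if torch.cuda.is_available() else "cpu"
        return batch.to(device) * 2  # stand-in for the real model call

The decorator is documented to be a no-op outside ZeroGPU Spaces, so the same function also runs unchanged on a machine with a permanently attached GPU.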
  import os
+ import sys  # used below for the sys.platform check
  import gradio as gr
+ from src.gradio_demo import SadTalker

+ try:
+     import webui  # present when running inside the stable-diffusion-webui extension
+     in_webui = True
+ except ImportError:
+     in_webui = False

+ def toggle_audio_file(choice):
+     if not choice:
+         return gr.update(visible=True), gr.update(visible=False)
      else:
+         return gr.update(visible=False), gr.update(visible=True)

+ def ref_video_fn(path_of_ref_video):
+     if path_of_ref_video:
+         return gr.update(value=True)
      else:
+         return gr.update(value=False)

+ def sadtalker_demo(checkpoint_path="checkpoints", config_path="src/config", warpfn=None):
+     sad_talker = SadTalker(checkpoint_path, config_path, lazy_load=True)

+     with gr.Blocks(analytics_enabled=False) as sadtalker_interface:
+         gr.Markdown("<div align='center'> <h2> 😭 SadTalker: Learning Realistic 3D Motion Coefficients for Stylized Audio-Driven Single Image Talking Face Animation (CVPR 2023) </h2> \
+                     <a style='font-size:18px;color: #efefef' href='https://arxiv.org/abs/2211.12194'>Arxiv</a> \
+                     <a style='font-size:18px;color: #efefef' href='https://sadtalker.github.io'>Homepage</a> \
+                     <a style='font-size:18px;color: #efefef' href='https://github.com/Winfredy/SadTalker'>Github</a> </div>")

+         with gr.Row().style(equal_height=False):
+             with gr.Column(variant="panel"):
+                 with gr.Tabs(elem_id="sadtalker_source_image"):
+                     with gr.TabItem("Upload image"):
+                         with gr.Row():
+                             source_image = gr.Image(label="Source image", source="upload", type="filepath", elem_id="img2img_image").style(width=512)

                  with gr.Tabs(elem_id="sadtalker_driven_audio"):
+                     with gr.TabItem("Upload OR TTS"):
+                         with gr.Column(variant="panel"):
+                             driven_audio = gr.Audio(label="Input audio", source="upload", type="filepath")
+
+                         if sys.platform != "win32" and not in_webui:
+                             from src.utils.text2speech import TTSTalker
+
+                             tts_talker = TTSTalker()
+                             with gr.Column(variant="panel"):
+                                 input_text = gr.Textbox(
+                                     label="Generating audio from text",
+                                     lines=5,
+                                     placeholder="Enter text for audio generation using @Coqui.ai TTS.",
+                                 )
+                                 tts = gr.Button("Generate audio", elem_id="sadtalker_audio_generate", variant="primary")
+                                 tts.click(tts_talker.test, [input_text], [driven_audio])
+
+             with gr.Column(variant="panel"):
+                 with gr.Tabs(elem_id="sadtalker_checkbox"):
+                     with gr.TabItem("Settings"):
+                         gr.Markdown(
+                             "Need help? Visit our [best practice page](https://github.com/OpenTalker/SadTalker/blob/main/docs/best_practice.md) for details."
+                         )
+                         with gr.Column(variant="panel"):
+                             # width = gr.Slider(
+                             #     minimum=64,
+                             #     elem_id="img2img_width",
+                             #     maximum=2048,
+                             #     step=8,
+                             #     label="Manually Crop Width",
+                             #     value=512,
+                             # )  # img2img_width
+                             # height = gr.Slider(
+                             #     minimum=64,
+                             #     elem_id="img2img_height",
+                             #     maximum=2048,
+                             #     step=8,
+                             #     label="Manually Crop Height",
+                             #     value=512,
+                             # )  # img2img_height
                              pose_style = gr.Slider(
+                                 minimum=0, maximum=46, step=1, label="Pose style", value=0
+                             )
                              size_of_image = gr.Radio(
+                                 [256, 512],
+                                 value=256,
+                                 label="face model resolution",
+                                 info="Use 256/512 model?",
+                             )
                              preprocess_type = gr.Radio(
+                                 ["crop", "resize", "full", "extcrop", "extfull"],
+                                 value="crop",
+                                 label="preprocess",
+                                 info="How to handle input image?",
+                             )
                              is_still_mode = gr.Checkbox(
+                                 label="Still Mode (fewer head motion, works with preprocess `full`)"
+                             )
                              batch_size = gr.Slider(
+                                 label="batch size in generation", step=1, maximum=10, value=2
+                             )
+                             enhancer = gr.Checkbox(label="GFPGAN as Face enhancer")
+                             submit = gr.Button("Generate", elem_id="sadtalker_generate", variant="primary")
+
+                 with gr.Tabs(elem_id="sadtalker_genearted"):
+                     gen_video = gr.Video(label="Generated video", format="mp4").style(width=256)
+
+         if warpfn:
+             submit.click(
+                 warpfn(sad_talker.test),
+                 [source_image, driven_audio, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style],
+                 [gen_video],
+             )
+         else:
+             submit.click(
+                 sad_talker.test,
+                 [source_image, driven_audio, preprocess_type, is_still_mode, enhancer, batch_size, size_of_image, pose_style],
+                 [gen_video],
+             )
+
+     return sadtalker_interface
+
+ if __name__ == "__main__":
+     demo = sadtalker_demo()
+     demo.queue()
+     demo.launch()
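Two helpers above, toggle_audio_file and ref_video_fn, are defined but never connected inside the visible hunk; their event wiring lives outside it. A sketch of how such callbacks are typically hooked up in Gradio Blocks; every component name below is illustrative, not part of this commit:

    # hypothetical wiring, inside the gr.Blocks context
    use_idle_mode = gr.Checkbox(label="Use idle animation")        # illustrative
    driven_audio_no = gr.Audio(label="Idle audio", visible=False)  # illustrative
    use_idle_mode.change(toggle_audio_file,
                         inputs=use_idle_mode,
                         outputs=[driven_audio, driven_audio_no])  # two outputs, matching the two gr.update() values returned
    ref_video = gr.Video(label="Reference video")                  # illustrative
    use_ref_video = gr.Checkbox(label="Use reference video")       # illustrative
    ref_video.change(ref_video_fn, inputs=ref_video, outputs=use_ref_video)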