ATP
commited on
Commit
·
8746ace
1
Parent(s):
09f2988
Add application file
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- app.py +177 -0
- audio.py +136 -0
- checkpoints/README.md +1 -0
- color_syncnet_train.py +279 -0
- evaluation/README.md +63 -0
- evaluation/gen_videos_from_filelist.py +238 -0
- evaluation/real_videos_inference.py +305 -0
- evaluation/scores_LSE/SyncNetInstance_calc_scores.py +210 -0
- evaluation/scores_LSE/calculate_scores_LRS.py +53 -0
- evaluation/scores_LSE/calculate_scores_real_videos.py +45 -0
- evaluation/scores_LSE/calculate_scores_real_videos.sh +8 -0
- evaluation/test_filelists/README.md +13 -0
- evaluation/test_filelists/ReSyncED/random_pairs.txt +160 -0
- evaluation/test_filelists/ReSyncED/tts_pairs.txt +18 -0
- evaluation/test_filelists/lrs2.txt +0 -0
- evaluation/test_filelists/lrs3.txt +0 -0
- evaluation/test_filelists/lrw.txt +0 -0
- face_detection/README.md +1 -0
- face_detection/__init__.py +7 -0
- face_detection/__pycache__/__init__.cpython-36.pyc +0 -0
- face_detection/__pycache__/__init__.cpython-38.pyc +0 -0
- face_detection/__pycache__/__init__.cpython-39.pyc +0 -0
- face_detection/__pycache__/api.cpython-36.pyc +0 -0
- face_detection/__pycache__/api.cpython-38.pyc +0 -0
- face_detection/__pycache__/api.cpython-39.pyc +0 -0
- face_detection/__pycache__/models.cpython-36.pyc +0 -0
- face_detection/__pycache__/models.cpython-38.pyc +0 -0
- face_detection/__pycache__/models.cpython-39.pyc +0 -0
- face_detection/__pycache__/utils.cpython-36.pyc +0 -0
- face_detection/__pycache__/utils.cpython-38.pyc +0 -0
- face_detection/__pycache__/utils.cpython-39.pyc +0 -0
- face_detection/api.py +79 -0
- face_detection/detection/__init__.py +1 -0
- face_detection/detection/__pycache__/__init__.cpython-36.pyc +0 -0
- face_detection/detection/__pycache__/__init__.cpython-38.pyc +0 -0
- face_detection/detection/__pycache__/core.cpython-36.pyc +0 -0
- face_detection/detection/__pycache__/core.cpython-38.pyc +0 -0
- face_detection/detection/core.py +130 -0
- face_detection/detection/sfd/__init__.py +1 -0
- face_detection/detection/sfd/__pycache__/__init__.cpython-36.pyc +0 -0
- face_detection/detection/sfd/__pycache__/__init__.cpython-38.pyc +0 -0
- face_detection/detection/sfd/__pycache__/bbox.cpython-36.pyc +0 -0
- face_detection/detection/sfd/__pycache__/bbox.cpython-38.pyc +0 -0
- face_detection/detection/sfd/__pycache__/detect.cpython-36.pyc +0 -0
- face_detection/detection/sfd/__pycache__/detect.cpython-38.pyc +0 -0
- face_detection/detection/sfd/__pycache__/net_s3fd.cpython-36.pyc +0 -0
- face_detection/detection/sfd/__pycache__/net_s3fd.cpython-38.pyc +0 -0
- face_detection/detection/sfd/__pycache__/sfd_detector.cpython-36.pyc +0 -0
- face_detection/detection/sfd/__pycache__/sfd_detector.cpython-38.pyc +0 -0
- face_detection/detection/sfd/bbox.py +129 -0
app.py
ADDED
@@ -0,0 +1,177 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import gradio as gr
|
2 |
+
import subprocess
|
3 |
+
from transformers import Wav2Vec2Processor, Wav2Vec2ForCTC, AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
|
4 |
+
import torch
|
5 |
+
import librosa
|
6 |
+
import tempfile
|
7 |
+
from neon_tts_plugin_coqui import CoquiTTS
|
8 |
+
from gtts import gTTS
|
9 |
+
from numba import cuda
|
10 |
+
|
11 |
+
#variables
|
12 |
+
# Default language selections (two-letter UI codes). The handlers below take
# the active selection as arguments, so these only record the defaults.
language_input_audio = 'en'
language_output_audio = 'ch'

# Maps the two-letter UI codes to the FLORES-200 codes passed to the NLLB
# translation pipeline as src_lang / tgt_lang. The codes are case-sensitive:
# three lowercase letters, underscore, capitalised script name.
dict_lang = {
    'en': 'eng_Latn',
    'es': 'spa_Latn',
    'fr': 'fra_Latn',
    'de': 'deu_Latn',
    'pl': 'pol_Latn',
    'uk': 'ukr_Cyrl',
    'ro': 'ron_Latn',
    'hu': 'hun_Latn',
    'bg': 'bul_Cyrl',
    'nl': 'nld_Latn',
    'fi': 'fin_Latn',
    'sl': 'slv_Latn',
    'lv': 'lvs_Latn',
    'ga': 'gle_Latn',
    # NOTE(review): 'zho_Hant' is Traditional script while the speech step
    # uses gTTS 'zh-CN'; confirm whether 'zho_Hans' was intended.
    'ch': 'zho_Hant',
    'ru': 'rus_Cyrl',
}
|
32 |
+
|
33 |
+
#functions
|
34 |
+
def radio_lang_input(lang):
    """Return the update that stores *lang* in the ``var`` state component.

    Used as the change handler of the input-language radio button. The
    returned dict is keyed by the module-level ``var`` component; no
    module-level variable is modified.
    """
    selected = lang
    return {var: selected}
|
37 |
+
|
38 |
+
#a function that determines the language of the output audio
|
39 |
+
def radio_input(lang):
    """Return the update that stores *lang* in the ``var_lang`` state component.

    Used as the change handler of the output-language radio button. The
    returned dict is keyed by the module-level ``var_lang`` component; no
    module-level variable is modified.
    """
    selected = lang
    return {var_lang: selected}
|
42 |
+
|
43 |
+
##
|
44 |
+
#convert input video file to text, audio, video
|
45 |
+
def video_load(video, language_input_audio, language_output_audio):
    """Translate the speech of a video and re-render it with the new audio.

    Steps: rescale the video to a width of 720 px with ffmpeg, extract a
    16 kHz WAV track, transcribe it with a Wav2Vec2 CTC model, translate the
    transcript with NLLB-200, synthesise the translation to speech via
    ``text_to_audio_custom`` and run the Wav2Lip ``inference.py`` script on
    the rescaled video with the synthesised audio.

    Intermediate files (``video720p.mp4``, ``sound_from_input_video.wav``) are
    written to the current working directory under fixed names.

    Args:
        video: Path of the input video file.
        language_input_audio: Language spoken in the video, ``'en'`` or ``'ru'``.
        language_output_audio: Target language; must be a key of ``dict_lang``.

    Returns:
        Tuple ``(translated_text, audio_path, video_path)``.

    Raises:
        ValueError: If no video is given or the input language has no
            configured speech-recognition model.
        KeyError: If the target language is not a key of ``dict_lang``.
        subprocess.CalledProcessError: If ffmpeg or ``inference.py`` exits
            with a non-zero status.
    """
    asr_checkpoints = {
        'en': "facebook/wav2vec2-base-960h",
        'ru': "jonatasgrosman/wav2vec2-large-xlsr-53-russian",
    }
    if not video:
        raise ValueError("No input video was provided")
    if language_input_audio not in asr_checkpoints:
        raise ValueError(
            f"Unsupported input language: {language_input_audio!r}")
    # Resolve both codes before any expensive work so bad input fails fast.
    src_lang = dict_lang[language_input_audio]
    tgt_lang = dict_lang[language_output_audio]

    # Argument lists (no shell) keep a path containing spaces or shell
    # metacharacters from being interpreted as part of the command line.
    subprocess.run(
        ['ffmpeg', '-y', '-i', str(video), '-vf', 'scale=720:-2',
         'video720p.mp4'],
        check=True)
    subprocess.run(
        ['ffmpeg', '-y', '-i', 'video720p.mp4', '-vn', '-ar', '16000',
         '-ac', '2', '-ab', '192K', '-f', 'wav',
         'sound_from_input_video.wav'],
        check=True)

    # Speech to text.
    checkpoint = asr_checkpoints[language_input_audio]
    processor = Wav2Vec2Processor.from_pretrained(checkpoint)
    asr_model = Wav2Vec2ForCTC.from_pretrained(checkpoint)
    speech, rate = librosa.load('sound_from_input_video.wav', sr=16000)
    input_values = processor(
        speech, sampling_rate=rate, return_tensors="pt",
        padding="longest").input_values
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        logits = asr_model(input_values).logits
    predicted_ids = torch.argmax(logits, dim=-1)
    transcription = processor.batch_decode(predicted_ids)[0]

    # Text to translated text. The pipeline is kept on CPU (device=-1), as in
    # the original call.
    model = AutoModelForSeq2SeqLM.from_pretrained("facebook/nllb-200-distilled-600M")
    tokenizer = AutoTokenizer.from_pretrained("facebook/nllb-200-distilled-600M")
    translation_pipeline = pipeline(
        "translation", model=model, tokenizer=tokenizer,
        src_lang=src_lang, tgt_lang=tgt_lang,
        max_length=2000000, device=-1)
    result = translation_pipeline(transcription)
    text_translations = result[0]['translation_text']

    # Translated text to speech (shared with the "custom translation" button).
    audio = text_to_audio_custom(text_translations, language_output_audio)

    # Speech + rescaled video to lip-synced video.
    subprocess.run(
        ['python', 'inference.py', '--checkpoint_path', 'wav2lip_gan.pth',
         '--face', 'video720p.mp4', '--audio', str(audio), '--nosmooth',
         '--pads', '0', '20', '0', '0'],
        check=True)
    video = 'results/result_voice.mp4'
    return text_translations, audio, video
|
106 |
+
|
107 |
+
##
|
108 |
+
# function for create video from audio
|
109 |
+
def audio_to_video_custom(audio):
    """Run Wav2Lip on ``video720p.mp4`` using the given audio file.

    Args:
        audio: Path of the audio file to lip-sync the video to.

    Returns:
        The fixed output path ``'results/result_voice.mp4'``.

    Raises:
        ValueError: If no audio path is given.
        subprocess.CalledProcessError: If ``inference.py`` exits with a
            non-zero status.
    """
    if not audio:
        raise ValueError("No audio file was provided")
    # An argument list (no shell) keeps unusual characters in the path from
    # being interpreted as part of the command line.
    subprocess.run(
        ['python', 'inference.py', '--checkpoint_path', 'wav2lip_gan.pth',
         '--face', 'video720p.mp4', '--audio', str(audio), '--nosmooth',
         '--pads', '0', '20', '0', '0'],
        check=True)
    video = 'results/result_voice.mp4'
    return video
|
113 |
+
|
114 |
+
##
|
115 |
+
# function for create audio from custom translations
|
116 |
+
def text_to_audio_custom(text_translations, language_output_audio):
    """Synthesise *text_translations* to speech and return the audio file path.

    Russian and Chinese are synthesised with gTTS into fixed file names
    (``ru.mp3`` / ``china.mp3``) in the working directory; the other supported
    languages are synthesised with CoquiTTS into a temporary ``.wav`` file
    that is not deleted automatically.

    Args:
        text_translations: Text to speak.
        language_output_audio: Two-letter language code selected in the UI.

    Returns:
        Path of the generated audio file.

    Raises:
        ValueError: If the language is not handled by either backend.
    """
    # UI code -> (gTTS language code, output file name).
    gtts_targets = {
        'ru': ('ru', 'ru.mp3'),
        'ch': ('zh-CN', 'china.mp3'),
    }
    coqui_languages = ('en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg',
                       'nl', 'fi', 'sl', 'lv', 'ga')

    if language_output_audio in gtts_targets:
        gtts_lang, filename = gtts_targets[language_output_audio]
        tts = gTTS(text_translations, lang=gtts_lang)
        tts.save(filename)
        return filename

    if language_output_audio in coqui_languages:
        coquiTTS = CoquiTTS()
        # delete=False: the file must outlive this function so the caller can
        # read it.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
            coquiTTS.get_tts(text_translations, fp,
                             speaker={"language": language_output_audio})
            return fp.name

    raise ValueError(
        f"Unsupported output language: {language_output_audio!r}")
|
135 |
+
|
136 |
+
##### blocks
|
137 |
+
with gr.Blocks(title="Speak video in any language") as demo:
    # State components carrying the selected input / output language codes.
    var = gr.State('en')
    var_lang = gr.State('ch')

    # Introductory text.
    gr.Markdown("Service for translating videos into other languages with support for the speaker's facial expressions")
    gr.Markdown("The uploaded video must be only with a face. Preferably without sudden movements of the head.")

    with gr.Row():
        with gr.Column():
            # Inputs: source language, video, target language, main button.
            radio_input_lang_video = gr.Radio(
                choices=['en', 'ru'],
                value="en",
                label='Select input video language',
            )
            seed = gr.Video(label="Input Video")
            radio = gr.Radio(
                choices=['en', 'es', 'fr', 'de', 'pl', 'uk', 'ro', 'hu', 'bg', 'nl', 'fi', 'sl', 'lv', 'ga', 'ch', 'ru'],
                value="ch",
                label='Choose the language you want to speak',
            )
            btn_1 = gr.Button("1. Generate video with translated audio")

        with gr.Column():
            # Outputs; text and audio are editable so they can be re-run.
            translations_text = gr.Text(label="Generated Translations Text", interactive=True)
            btn_3 = gr.Button("Generate custom translations to speech")
            translations_audio = gr.Audio(label="Generated Translations Audio", interactive=True, type="filepath")
            btn_2 = gr.Button("Generate video with custom audio")
            video_output = gr.Video(interactive=False, label="Generated Translations Video")

    # Keep the state components in sync with the radio buttons.
    radio_input_lang_video.change(fn=radio_lang_input, inputs=radio_input_lang_video, outputs=var)
    radio.change(fn=radio_input, inputs=radio, outputs=var_lang)

    # Full pipeline: video -> translated text, speech and lip-synced video.
    btn_1.click(
        fn=video_load,
        inputs=[seed, var, var_lang],
        outputs=[translations_text, translations_audio, video_output],
    )
    # Re-render the video from the (possibly replaced) audio.
    btn_2.click(fn=audio_to_video_custom, inputs=[translations_audio], outputs=[video_output])
    # Re-synthesise speech from the (possibly edited) translation text.
    btn_3.click(
        fn=text_to_audio_custom,
        inputs=[translations_text, var_lang],
        outputs=[translations_audio],
    )

demo.launch(show_api=False)
|
audio.py
ADDED
@@ -0,0 +1,136 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import librosa
|
2 |
+
import librosa.filters
|
3 |
+
import numpy as np
|
4 |
+
# import tensorflow as tf
|
5 |
+
from scipy import signal
|
6 |
+
from scipy.io import wavfile
|
7 |
+
from hparams import hparams as hp
|
8 |
+
|
9 |
+
def load_wav(path, sr):
    """Load *path* with librosa (passing ``sr`` through) and return the samples only."""
    samples, _ = librosa.core.load(path, sr=sr)
    return samples
|
11 |
+
|
12 |
+
def save_wav(wav, path, sr):
    """Peak-normalise *wav* to the int16 range and write it as a WAV file.

    The samples are scaled so the largest absolute value maps to 32767; the
    peak is floored at 0.01 so near-silent input is not amplified without
    bound. The caller's array is not modified, and integer arrays are
    accepted.

    Args:
        wav: Array-like of audio samples.
        path: Destination file path (or file object) for ``wavfile.write``.
        sr: Sample rate written to the file header.
    """
    samples = np.asarray(wav)
    # Scale into a new array instead of `wav *= ...`, which rewrote the
    # caller's buffer and raised for integer dtypes.
    scaled = samples * (32767 / max(0.01, np.max(np.abs(samples))))
    wavfile.write(path, sr, scaled.astype(np.int16))
|
16 |
+
|
17 |
+
def save_wavenet_wav(wav, path, sr):
    """Write *wav* to *path* as a WAV file without rescaling the samples.

    ``librosa.output.write_wav`` is not available in current librosa
    releases, so the samples are written with ``scipy.io.wavfile`` (already
    imported by this module). The sample dtype is kept as given.

    Args:
        wav: Array of audio samples; a ``(2, n)`` array is written as stereo.
        path: Destination file path.
        sr: Sample rate written to the file header.
    """
    samples = np.asarray(wav)
    # wavfile expects (n_samples, n_channels); librosa's layout is
    # (n_channels, n_samples).
    if samples.ndim > 1 and samples.shape[0] == 2:
        samples = samples.T
    wavfile.write(path, sr, samples)
|
19 |
+
|
20 |
+
def preemphasis(wav, k, preemphasize=True):
    """Apply the FIR filter ``y[n] = x[n] - k * x[n-1]`` to *wav*.

    Returns *wav* unchanged (the same object) when *preemphasize* is false.
    """
    if not preemphasize:
        return wav
    numerator = [1, -k]
    denominator = [1]
    return signal.lfilter(numerator, denominator, wav)
|
24 |
+
|
25 |
+
def inv_preemphasis(wav, k, inv_preemphasize=True):
    """Apply the IIR filter ``y[n] = x[n] + k * y[n-1]`` (inverse of preemphasis).

    Returns *wav* unchanged (the same object) when *inv_preemphasize* is false.
    """
    if not inv_preemphasize:
        return wav
    numerator = [1]
    denominator = [1, -k]
    return signal.lfilter(numerator, denominator, wav)
|
29 |
+
|
30 |
+
def get_hop_size():
    """Return the STFT hop size in samples.

    Uses ``hp.hop_size`` when it is set; otherwise derives it from
    ``hp.frame_shift_ms`` and ``hp.sample_rate``.
    """
    configured = hp.hop_size
    if configured is not None:
        return configured
    assert hp.frame_shift_ms is not None
    return int(hp.frame_shift_ms / 1000 * hp.sample_rate)
|
36 |
+
|
37 |
+
def linearspectrogram(wav):
    """Return the dB-scaled magnitude STFT of *wav*, offset by ``hp.ref_level_db``.

    The result is passed through ``_normalize`` when
    ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    magnitudes = np.abs(_stft(emphasized))
    S = _amp_to_db(magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S
|
44 |
+
|
45 |
+
def melspectrogram(wav):
    """Return the dB-scaled mel spectrogram of *wav*, offset by ``hp.ref_level_db``.

    The result is passed through ``_normalize`` when
    ``hp.signal_normalization`` is set.
    """
    emphasized = preemphasis(wav, hp.preemphasis, hp.preemphasize)
    mel_magnitudes = _linear_to_mel(np.abs(_stft(emphasized)))
    S = _amp_to_db(mel_magnitudes) - hp.ref_level_db
    return _normalize(S) if hp.signal_normalization else S
|
52 |
+
|
53 |
+
def _lws_processor():
    """Build an ``lws`` processor from the hparams (``lws`` is imported lazily)."""
    import lws
    hop = get_hop_size()
    return lws.lws(hp.n_fft, hop, fftsize=hp.win_size, mode="speech")
|
56 |
+
|
57 |
+
def _stft(y):
    """Return the STFT of *y* using lws or librosa, depending on ``hp.use_lws``.

    With lws the processor's output is transposed before it is returned;
    otherwise ``librosa.stft`` is called with the FFT size, hop size and
    window size taken from the hparams.
    """
    if hp.use_lws:
        # _lws_processor takes no arguments; it reads hp itself.
        return _lws_processor().stft(y).T
    return librosa.stft(y=y, n_fft=hp.n_fft, hop_length=get_hop_size(),
                        win_length=hp.win_size)
|
62 |
+
|
63 |
+
##########################################################
|
64 |
+
#Those are only correct when using lws!!! (This was messing with Wavenet quality for a long time!)
|
65 |
+
def num_frames(length, fsize, fshift):
|
66 |
+
"""Compute number of time frames of spectrogram
|
67 |
+
"""
|
68 |
+
pad = (fsize - fshift)
|
69 |
+
if length % fshift == 0:
|
70 |
+
M = (length + pad * 2 - fsize) // fshift + 1
|
71 |
+
else:
|
72 |
+
M = (length + pad * 2 - fsize) // fshift + 2
|
73 |
+
return M
|
74 |
+
|
75 |
+
|
76 |
+
def pad_lr(x, fsize, fshift):
    """Compute left and right padding.

    Returns ``(left, right)``: the left pad is ``fsize - fshift`` and the
    right pad adds whatever is needed to fill the last of the
    ``num_frames`` frames.
    """
    n_frames = num_frames(len(x), fsize, fshift)
    left = fsize - fshift
    padded_len = len(x) + 2 * left
    remainder = (n_frames - 1) * fshift + fsize - padded_len
    return left, left + remainder
|
84 |
+
##########################################################
|
85 |
+
#Librosa correct padding
|
86 |
+
def librosa_pad_lr(x, fsize, fshift):
    """Return ``(0, right)`` where *right* pads ``x`` past the next multiple of *fshift*.

    *fsize* is accepted for signature parity with ``pad_lr`` and is unused.
    """
    n_samples = x.shape[0]
    right = (n_samples // fshift + 1) * fshift - n_samples
    return 0, right
|
88 |
+
|
89 |
+
# Conversions
|
90 |
+
# Mel filterbank cache; built on first use by _linear_to_mel.
_mel_basis = None


def _linear_to_mel(spectogram):
    """Project a linear-frequency spectrogram onto the cached mel filterbank."""
    global _mel_basis
    basis = _mel_basis
    if basis is None:
        basis = _build_mel_basis()
        _mel_basis = basis
    return np.dot(basis, spectogram)
|
97 |
+
|
98 |
+
def _build_mel_basis():
    """Build the mel filterbank matrix from the hparams.

    ``sr`` and ``n_fft`` are passed by keyword: recent librosa releases
    reject them as positional arguments, and keywords are accepted by older
    releases as well.

    Raises:
        AssertionError: If ``hp.fmax`` exceeds the Nyquist frequency.
    """
    assert hp.fmax <= hp.sample_rate // 2
    return librosa.filters.mel(sr=hp.sample_rate, n_fft=hp.n_fft,
                               n_mels=hp.num_mels,
                               fmin=hp.fmin, fmax=hp.fmax)
|
102 |
+
|
103 |
+
def _amp_to_db(x):
    """Convert amplitudes to decibels, flooring them at ``hp.min_level_db``."""
    # Amplitude that corresponds to hp.min_level_db dB.
    floor = np.exp(hp.min_level_db / 20 * np.log(10))
    clamped = np.maximum(floor, x)
    return 20 * np.log10(clamped)
|
106 |
+
|
107 |
+
def _db_to_amp(x):
|
108 |
+
return np.power(10.0, (x) * 0.05)
|
109 |
+
|
110 |
+
def _normalize(S):
    """Rescale a dB spectrogram from ``[hp.min_level_db, 0]`` to the model range.

    The target range is ``[-hp.max_abs_value, hp.max_abs_value]`` when
    ``hp.symmetric_mels`` is set, otherwise ``[0, hp.max_abs_value]``. With
    ``hp.allow_clipping_in_normalization`` the result is clipped to that
    range; without it the input is asserted to lie within bounds instead.
    """
    clipping = hp.allow_clipping_in_normalization
    if not clipping:
        assert S.max() <= 0 and S.min() - hp.min_level_db >= 0

    # Position of each value between min_level_db (-> 0) and 0 dB (-> 1).
    fraction = (S - hp.min_level_db) / (-hp.min_level_db)
    if hp.symmetric_mels:
        scaled = (2 * hp.max_abs_value) * fraction - hp.max_abs_value
        lower = -hp.max_abs_value
    else:
        scaled = hp.max_abs_value * fraction
        lower = 0

    if clipping:
        return np.clip(scaled, lower, hp.max_abs_value)
    return scaled
|
123 |
+
|
124 |
+
def _denormalize(D):
    """Invert ``_normalize``: map model-range values back to decibels.

    With ``hp.allow_clipping_in_normalization`` the input is first clipped
    to the range implied by ``hp.symmetric_mels``.
    """
    values = D
    if hp.allow_clipping_in_normalization:
        lower = -hp.max_abs_value if hp.symmetric_mels else 0
        values = np.clip(values, lower, hp.max_abs_value)

    if hp.symmetric_mels:
        return (((values + hp.max_abs_value) * -hp.min_level_db / (2 * hp.max_abs_value)) + hp.min_level_db)
    return ((values * -hp.min_level_db / hp.max_abs_value) + hp.min_level_db)
|
checkpoints/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
Place all your checkpoints (.pth files) here.
|
color_syncnet_train.py
ADDED
@@ -0,0 +1,279 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os.path import dirname, join, basename, isfile
|
2 |
+
from tqdm import tqdm
|
3 |
+
|
4 |
+
from models import SyncNet_color as SyncNet
|
5 |
+
import audio
|
6 |
+
|
7 |
+
import torch
|
8 |
+
from torch import nn
|
9 |
+
from torch import optim
|
10 |
+
import torch.backends.cudnn as cudnn
|
11 |
+
from torch.utils import data as data_utils
|
12 |
+
import numpy as np
|
13 |
+
|
14 |
+
from glob import glob
|
15 |
+
|
16 |
+
import os, random, cv2, argparse
|
17 |
+
from hparams import hparams, get_image_list
|
18 |
+
|
19 |
+
# Command-line interface. Arguments are parsed at module level, so the values
# are available to the Dataset class below through `args`.
parser = argparse.ArgumentParser(
    description='Code to train the expert lip-sync discriminator')
parser.add_argument(
    "--data_root", required=True,
    help="Root folder of the preprocessed LRS2 dataset")
parser.add_argument(
    '--checkpoint_dir', required=True, type=str,
    help='Save checkpoints to this directory')
parser.add_argument(
    '--checkpoint_path', default=None, type=str,
    help='Resumed from this checkpoint')
args = parser.parse_args()

# Training progress counters; updated by train() and load_checkpoint().
global_step = 0
global_epoch = 0

use_cuda = torch.cuda.is_available()
print('use_cuda: {}'.format(use_cuda))

# Number of consecutive video frames per sample and the matching number of
# mel-spectrogram steps.
syncnet_T = 5
syncnet_mel_step_size = 16
|
36 |
+
|
37 |
+
class Dataset(object):
    """Random sampler of (frame window, mel window, label) training examples.

    ``__getitem__`` ignores the requested index: it keeps drawing random
    videos and frames until a complete example can be built.
    """

    def __init__(self, split):
        """Collect the video folders of *split* under ``args.data_root``."""
        self.all_videos = get_image_list(args.data_root, split)

    def get_frame_id(self, frame):
        """Return the integer frame number encoded in the frame's file name."""
        stem = basename(frame).split('.')[0]
        return int(stem)

    def get_window(self, start_frame):
        """Return the ``syncnet_T`` consecutive frame paths from *start_frame*.

        Returns ``None`` as soon as one of the expected ``<id>.jpg`` files is
        missing.
        """
        first_id = self.get_frame_id(start_frame)
        folder = dirname(start_frame)

        fnames = []
        for offset in range(syncnet_T):
            candidate = join(folder, '{}.jpg'.format(first_id + offset))
            if not isfile(candidate):
                return None
            fnames.append(candidate)
        return fnames

    def crop_audio_window(self, spec, start_frame):
        """Slice ``syncnet_mel_step_size`` mel steps starting at *start_frame*.

        The start index assumes 80 mel steps per second of audio
        (num_frames = (T x hop_size * fps) / sample_rate).
        """
        frame_num = self.get_frame_id(start_frame)
        begin = int(80. * (frame_num / float(hparams.fps)))
        return spec[begin : begin + syncnet_mel_step_size, :]

    def __len__(self):
        """Return the number of video folders."""
        return len(self.all_videos)

    def __getitem__(self, idx):
        """Return ``(x, mel, y)`` for a randomly drawn example.

        ``x`` is the stacked frame window with only the second half of the
        image rows kept, ``mel`` is the mel window cropped at the reference
        frame, and ``y`` is 1 when the frame window starts at that reference
        frame and 0 when it starts at a different random frame of the same
        video.
        """
        while True:
            idx = random.randint(0, len(self.all_videos) - 1)
            vidname = self.all_videos[idx]

            img_names = list(glob(join(vidname, '*.jpg')))
            if len(img_names) <= 3 * syncnet_T:
                continue

            # Reference frame, and a different frame for negative examples.
            img_name = random.choice(img_names)
            wrong_img_name = random.choice(img_names)
            while wrong_img_name == img_name:
                wrong_img_name = random.choice(img_names)

            in_sync = random.choice([True, False])
            y = torch.ones(1).float() if in_sync else torch.zeros(1).float()
            chosen = img_name if in_sync else wrong_img_name

            window_fnames = self.get_window(chosen)
            if window_fnames is None:
                continue

            # Read and resize every frame; give up on the sample at the first
            # unreadable or unresizable image.
            window = []
            for fname in window_fnames:
                img = cv2.imread(fname)
                if img is None:
                    break
                try:
                    img = cv2.resize(img, (hparams.img_size, hparams.img_size))
                except Exception:
                    break
                window.append(img)
            if len(window) != len(window_fnames):
                continue

            try:
                wavpath = join(vidname, "audio.wav")
                wav = audio.load_wav(wavpath, hparams.sample_rate)
                orig_mel = audio.melspectrogram(wav).T
            except Exception:
                continue

            # The audio is always cropped at the reference frame, even when
            # the frames come from `wrong_img_name`.
            mel = self.crop_audio_window(orig_mel.copy(), img_name)
            if mel.shape[0] != syncnet_mel_step_size:
                continue

            # H x W x 3 * T -> (3 * T) x H x W, then keep the second half of
            # the H axis.
            stacked = np.concatenate(window, axis=2) / 255.
            stacked = stacked.transpose(2, 0, 1)
            stacked = stacked[:, stacked.shape[1]//2:]

            x = torch.FloatTensor(stacked)
            mel = torch.FloatTensor(mel.T).unsqueeze(0)

            return x, mel, y
|
132 |
+
|
133 |
+
logloss = nn.BCELoss()


def cosine_loss(a, v, y):
    """Return the binary cross-entropy between cos-sim(a, v) and the labels *y*.

    The per-row cosine similarity is given a trailing dimension of size 1
    before it is compared with *y*.
    """
    similarity = nn.functional.cosine_similarity(a, v)
    return logloss(similarity.unsqueeze(1), y)
|
139 |
+
|
140 |
+
def train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=None, checkpoint_interval=None, nepochs=None):
    """Train *model* until ``global_epoch`` reaches *nepochs*.

    Advances the module-level ``global_step`` / ``global_epoch`` counters,
    saves a checkpoint on the first step and every *checkpoint_interval*
    steps, and evaluates every ``hparams.syncnet_eval_interval`` steps.
    """
    global global_step, global_epoch

    while global_epoch < nepochs:
        running_loss = 0.
        prog_bar = tqdm(enumerate(train_data_loader))
        for step, (x, mel, y) in prog_bar:
            # Re-enter training mode each step: eval_model switches the
            # model to eval mode.
            model.train()
            optimizer.zero_grad()

            # Move the batch to the training device.
            x = x.to(device)
            mel = mel.to(device)
            y = y.to(device)

            a, v = model(mel, x)
            loss = cosine_loss(a, v, y)
            loss.backward()
            optimizer.step()

            global_step += 1
            running_loss += loss.item()

            is_checkpoint_step = (
                global_step == 1 or global_step % checkpoint_interval == 0)
            if is_checkpoint_step:
                save_checkpoint(
                    model, optimizer, global_step, checkpoint_dir, global_epoch)

            if global_step % hparams.syncnet_eval_interval == 0:
                with torch.no_grad():
                    eval_model(test_data_loader, global_step, device, model, checkpoint_dir)

            mean_loss = running_loss / (step + 1)
            prog_bar.set_description('Loss: {}'.format(mean_loss))

        global_epoch += 1
|
180 |
+
|
181 |
+
def eval_model(test_data_loader, global_step, device, model, checkpoint_dir):
    """Print the mean loss of *model* over one pass of the test loader.

    The pass stops early once the batch index exceeds ``eval_steps``. The
    model is left in eval mode. Nothing is returned. An empty loader is
    reported instead of raising ``ZeroDivisionError``.

    ``global_step`` and ``checkpoint_dir`` are accepted for interface
    compatibility and are not used.
    """
    eval_steps = 1400
    print('Evaluating for {} steps'.format(eval_steps))
    losses = []

    model.eval()
    for step, (x, mel, y) in enumerate(test_data_loader):
        # Move the batch to the evaluation device.
        x = x.to(device)
        mel = mel.to(device)
        y = y.to(device)

        a, v = model(mel, x)
        loss = cosine_loss(a, v, y)
        losses.append(loss.item())

        if step > eval_steps:
            break

    if not losses:
        print('No evaluation batches available; skipping evaluation')
        return

    averaged_loss = sum(losses) / len(losses)
    print(averaged_loss)
|
207 |
+
|
208 |
+
def save_checkpoint(model, optimizer, step, checkpoint_dir, epoch):
    """Save the model (and optionally optimizer) state for *step*.

    The file is written to ``<checkpoint_dir>/checkpoint_step<step>.pth``,
    with the step zero-padded to nine digits. The optimizer state is stored
    only when ``hparams.save_optimizer_state`` is set, otherwise ``None``.
    """
    # Name the file after the `step` argument so it always matches the
    # "global_step" value stored inside the checkpoint.
    checkpoint_path = join(
        checkpoint_dir, "checkpoint_step{:09d}.pth".format(step))
    optimizer_state = optimizer.state_dict() if hparams.save_optimizer_state else None
    torch.save({
        "state_dict": model.state_dict(),
        "optimizer": optimizer_state,
        "global_step": step,
        "global_epoch": epoch,
    }, checkpoint_path)
    print("Saved checkpoint:", checkpoint_path)
|
220 |
+
|
221 |
+
def _load(checkpoint_path):
    """Load a checkpoint file, keeping tensors on CPU when CUDA is unavailable."""
    if not use_cuda:
        # Returning the storage unchanged keeps every tensor on the CPU.
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
|
228 |
+
|
229 |
+
def load_checkpoint(path, model, optimizer, reset_optimizer=False):
    """Restore *model* (and optionally *optimizer*) from the checkpoint at *path*.

    Also restores the module-level ``global_step`` and ``global_epoch``
    counters. The optimizer state is loaded only when *reset_optimizer* is
    false and the checkpoint contains one. Returns *model*.
    """
    global global_step
    global global_epoch

    print("Load checkpoint from: {}".format(path))
    checkpoint = _load(path)
    model.load_state_dict(checkpoint["state_dict"])

    saved_optimizer = None if reset_optimizer else checkpoint["optimizer"]
    if saved_optimizer is not None:
        print("Load optimizer state from {}".format(path))
        optimizer.load_state_dict(saved_optimizer)

    global_step = checkpoint["global_step"]
    global_epoch = checkpoint["global_epoch"]

    return model
|
245 |
+
|
246 |
+
if __name__ == "__main__":
    # Entry point: build the data loaders, model and optimizer, optionally
    # resume from a checkpoint, then train.
    checkpoint_dir = args.checkpoint_dir
    checkpoint_path = args.checkpoint_path

    # makedirs also creates missing parent directories and tolerates an
    # already-existing directory.
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Dataset and Dataloader setup
    train_dataset = Dataset('train')
    test_dataset = Dataset('val')

    train_data_loader = data_utils.DataLoader(
        train_dataset, batch_size=hparams.syncnet_batch_size, shuffle=True,
        num_workers=hparams.num_workers)

    test_data_loader = data_utils.DataLoader(
        test_dataset, batch_size=hparams.syncnet_batch_size,
        num_workers=8)

    device = torch.device("cuda" if use_cuda else "cpu")

    # Model
    model = SyncNet().to(device)
    print('total trainable params {}'.format(sum(p.numel() for p in model.parameters() if p.requires_grad)))

    optimizer = optim.Adam([p for p in model.parameters() if p.requires_grad],
                           lr=hparams.syncnet_lr)

    if checkpoint_path is not None:
        load_checkpoint(checkpoint_path, model, optimizer, reset_optimizer=False)

    train(device, model, train_data_loader, test_data_loader, optimizer,
          checkpoint_dir=checkpoint_dir,
          checkpoint_interval=hparams.syncnet_checkpoint_interval,
          nepochs=hparams.nepochs)
|
evaluation/README.md
ADDED
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Novel Evaluation Framework, new filelists, and using the LSE-D and LSE-C metric.
|
2 |
+
|
3 |
+
Our paper also proposes a novel evaluation framework (Section 4). To evaluate on LRS2, LRS3, and LRW, the filelists are present in the `test_filelists` folder. Please use `gen_videos_from_filelist.py` script to generate the videos. After that, you can calculate the LSE-D and LSE-C scores using the instructions below. Please see [this thread](https://github.com/Rudrabha/Wav2Lip/issues/22#issuecomment-712825380) on how to calculate the FID scores.
|
4 |
+
|
5 |
+
The videos of the ReSyncED benchmark for real-world evaluation will be released soon.
|
6 |
+
|
7 |
+
### Steps to set-up the evaluation repository for LSE-D and LSE-C metric:
|
8 |
+
We use the pre-trained syncnet model available in this [repository](https://github.com/joonson/syncnet_python).
|
9 |
+
|
10 |
+
* Clone the SyncNet repository.
|
11 |
+
```
|
12 |
+
git clone https://github.com/joonson/syncnet_python.git
|
13 |
+
```
|
14 |
+
* Follow the procedure given in the above linked [repository](https://github.com/joonson/syncnet_python) to download the pretrained models and set up the dependencies.
|
15 |
+
* **Note: Please set up a separate virtual environment for the evaluation scripts. The package versions used by Wav2Lip and by the publicly released SyncNet code are different, which can cause version-mismatch issues. To avoid this, we suggest that users create a separate virtual environment for the evaluation scripts.**
|
16 |
+
```
|
17 |
+
cd syncnet_python
|
18 |
+
pip install -r requirements.txt
|
19 |
+
sh download_model.sh
|
20 |
+
```
|
21 |
+
* The above steps should ensure that all the dependencies required by the repository are installed and the pre-trained models are downloaded.
|
22 |
+
|
23 |
+
### Running the evaluation scripts:
|
24 |
+
* Copy our evaluation scripts given in this folder to the cloned repository.
|
25 |
+
```
|
26 |
+
cd Wav2Lip/evaluation/scores_LSE/
|
27 |
+
cp *.py syncnet_python/
|
28 |
+
cp *.sh syncnet_python/
|
29 |
+
```
|
30 |
+
**Note: We will release the test filelists for LRW, LRS2 and LRS3 shortly once we receive permission from the dataset creators. We will also release the Real World Dataset we have collected shortly.**
|
31 |
+
|
32 |
+
* Our evaluation technique does not require ground truth of any sort. Given lip-synced videos, we can calculate the scores directly from the generated videos alone. Please store the generated videos (from our test sets or your own generated videos) in the following folder structure.
|
33 |
+
```
|
34 |
+
video data root (Folder containing all videos)
|
35 |
+
├── All .mp4 files
|
36 |
+
```
|
37 |
+
* Change the folder back to the cloned repository.
|
38 |
+
```
|
39 |
+
cd syncnet_python
|
40 |
+
```
|
41 |
+
* To run evaluation on the LRW, LRS2 and LRS3 test files, please run the following command:
|
42 |
+
```
|
43 |
+
python calculate_scores_LRS.py --data_root /path/to/video/data/root --tmp_dir tmp_dir/
|
44 |
+
```
|
45 |
+
|
46 |
+
* To run evaluation on the ReSyncED dataset or your own generated videos, please run the following command:
|
47 |
+
```
|
48 |
+
sh calculate_scores_real_videos.sh /path/to/video/data/root
|
49 |
+
```
|
50 |
+
* The generated scores will be present in the all_scores.txt generated in the ```syncnet_python/``` folder
|
51 |
+
|
52 |
+
# Evaluation of image quality using FID metric.
|
53 |
+
We use the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository for calculating the FID metrics. We dump all the frames in both ground-truth and generated videos and calculate the FID score.
|
54 |
+
|
55 |
+
|
56 |
+
# Opening issues related to evaluation scripts
|
57 |
+
* Please open the issues with the "Evaluation" label if you face any issues in the evaluation scripts.
|
58 |
+
|
59 |
+
# Acknowledgements
|
60 |
+
Our evaluation pipeline is based on two existing repositories. The LSE metrics are based on the [syncnet_python](https://github.com/joonson/syncnet_python) repository and the FID score is based on the [pytorch-fid](https://github.com/mseitzer/pytorch-fid) repository. We thank the authors of both repositories for releasing their wonderful code.
|
61 |
+
|
62 |
+
|
63 |
+
|
evaluation/gen_videos_from_filelist.py
ADDED
@@ -0,0 +1,238 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os import listdir, path
|
2 |
+
import numpy as np
|
3 |
+
import scipy, cv2, os, sys, argparse
|
4 |
+
import dlib, json, subprocess
|
5 |
+
from tqdm import tqdm
|
6 |
+
from glob import glob
|
7 |
+
import torch
|
8 |
+
|
9 |
+
sys.path.append('../')
|
10 |
+
import audio
|
11 |
+
import face_detection
|
12 |
+
from models import Wav2Lip
|
13 |
+
|
14 |
+
# Command-line interface: every path argument is mandatory, the tuning knobs
# (padding and batch sizes) have defaults.
parser = argparse.ArgumentParser(
    description='Code to generate results for test filelists')

# Inputs / outputs.
parser.add_argument(
    '--filelist', required=True, type=str,
    help='Filepath of filelist file to read')
parser.add_argument(
    '--results_dir', required=True, type=str,
    help='Folder to save all results into')
parser.add_argument('--data_root', required=True, type=str)
parser.add_argument(
    '--checkpoint_path', required=True, type=str,
    help='Name of saved checkpoint to load weights from')

# Face-crop padding and batch sizes.
parser.add_argument(
    '--pads', default=[0, 0, 0, 0], type=int, nargs='+',
    help='Padding (top, bottom, left, right)')
parser.add_argument(
    '--face_det_batch_size', default=64, type=int,
    help='Single GPU batch size for face detection')
parser.add_argument(
    '--wav2lip_batch_size', default=128, type=int,
    help='Batch size for Wav2Lip')

# parser.add_argument('--resize_factor', default=1, type=int)

args = parser.parse_args()
# Side length (in pixels) that face crops are resized to before inference.
args.img_size = 96
|
34 |
+
|
35 |
+
def get_smoothened_boxes(boxes, T):
    """Smooth per-frame boxes in place with a forward-looking moving average.

    Box ``i`` is replaced by the mean of boxes ``i .. i+T-1``. Near the end of
    the sequence, where fewer than ``T`` boxes remain, the mean of the last
    ``T`` boxes is used instead.

    Args:
        boxes: numpy array with one row per frame. It is modified in place, so
            means are cast to the array's dtype on assignment.
        T: window length in frames.

    Returns:
        The same ``boxes`` array, smoothed.
    """
    count = len(boxes)
    # Clamp to 0: for sequences shorter than T, a negative start index would
    # wrap around and silently drop the leading boxes from the window.
    tail_start = max(0, count - T)
    for i in range(count):
        if i + T > count:
            window = boxes[tail_start:]
        else:
            window = boxes[i:i + T]
        boxes[i] = np.mean(window, axis=0)
    return boxes
|
43 |
+
|
44 |
+
def face_detect(images):
    """Detect, pad and smooth one face box per frame.

    Detection runs in batches of ``args.face_det_batch_size``. On a
    ``RuntimeError`` the batch size is halved, written back to ``args``, and
    detection restarts from the first frame.

    Args:
        images: sequence of frames (arrays indexed as ``[row, col, ...]``).

    Returns:
        A list with one ``[face_crop, (y1, y2, x1, x2), True]`` entry per frame.

    Raises:
        RuntimeError: if detection still fails with a batch size of 1.
        ValueError: if the detector returns ``None`` for any frame.
    """
    chunk = args.face_det_batch_size

    detections = None
    while detections is None:
        collected = []
        try:
            for start in range(0, len(images), chunk):
                batch = np.array(images[start:start + chunk])
                collected.extend(detector.get_detections_for_batch(batch))
        except RuntimeError:
            if chunk == 1:
                raise RuntimeError('Image too big to run face detection on GPU')
            chunk //= 2
            args.face_det_batch_size = chunk
            print('Recovering from OOM error; New batch size: {}'.format(chunk))
        else:
            detections = collected

    pad_top, pad_bottom, pad_left, pad_right = args.pads
    padded = []
    for rect, image in zip(detections, images):
        if rect is None:
            raise ValueError('Face not detected!')

        # Grow the box by the requested padding, clipped to the frame borders.
        top = max(0, rect[1] - pad_top)
        bottom = min(image.shape[0], rect[3] + pad_bottom)
        left = max(0, rect[0] - pad_left)
        right = min(image.shape[1], rect[2] + pad_right)
        padded.append([left, top, right, bottom])

    smoothed = get_smoothened_boxes(np.array(padded), T=5)
    return [
        [image[y1: y2, x1:x2], (y1, y2, x1, x2), True]
        for image, (x1, y1, x2, y2) in zip(images, smoothed)
    ]
|
78 |
+
|
79 |
+
def datagen(frames, face_det_results, mels):
    """Yield model-ready batches of face crops and mel chunks.

    Args:
        frames: full video frames, indexable in step with ``mels``.
        face_det_results: per-frame ``[face, coords, valid]`` entries.
        mels: iterable of mel-spectrogram chunks, one per output frame.

    Yields:
        ``(img_batch, mel_batch, frame_batch, coords_batch)`` tuples holding at
        most ``args.wav2lip_batch_size`` items. ``img_batch`` is the masked
        crop concatenated with the original crop along axis 3 and divided by
        255; ``mel_batch`` gets a trailing singleton axis.

    Raises:
        ValueError: if there are more mel chunks than frames.
    """
    def _pack(faces, mel_items, kept_frames, kept_coords):
        # Convert the accumulated lists into the arrays the model consumes.
        faces, mel_items = np.asarray(faces), np.asarray(mel_items)

        masked = faces.copy()
        # Zero everything from index img_size//2 onwards along axis 1.
        masked[:, args.img_size//2:] = 0

        faces = np.concatenate((masked, faces), axis=3) / 255.
        mel_items = np.reshape(
            mel_items, [len(mel_items), mel_items.shape[1], mel_items.shape[2], 1])
        return faces, mel_items, kept_frames, kept_coords

    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    for i, m in enumerate(mels):
        if i >= len(frames):
            raise ValueError('Equal or less lengths only')

        frame_to_save = frames[i].copy()
        face, coords, valid_frame = face_det_results[i].copy()
        if not valid_frame:
            continue

        img_batch.append(cv2.resize(face, (args.img_size, args.img_size)))
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            yield _pack(img_batch, mel_batch, frame_batch, coords_batch)
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    # Flush the final, possibly partial, batch.
    if len(img_batch) > 0:
        yield _pack(img_batch, mel_batch, frame_batch, coords_batch)
|
119 |
+
|
120 |
+
fps = 25
|
121 |
+
mel_step_size = 16
|
122 |
+
mel_idx_multiplier = 80./fps
|
123 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
124 |
+
print('Using {} for inference.'.format(device))
|
125 |
+
|
126 |
+
detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
|
127 |
+
flip_input=False, device=device)
|
128 |
+
|
129 |
+
def _load(checkpoint_path):
    """Load a checkpoint file with ``torch.load``.

    When the module-level ``device`` is not ``'cuda'``, storages are kept where
    the ``map_location`` callback receives them instead of being moved to the
    device recorded in the file.
    """
    if device != 'cuda':
        return torch.load(checkpoint_path,
                          map_location=lambda storage, loc: storage)
    return torch.load(checkpoint_path)
|
136 |
+
|
137 |
+
def load_model(path):
    """Build a ``Wav2Lip`` model, load its weights from ``path`` and return it in eval mode.

    Every ``'module.'`` substring is stripped from the checkpoint's parameter
    names before loading (presumably left there by a DataParallel wrapper).
    """
    model = Wav2Lip()
    print("Load checkpoint from: {}".format(path))
    saved_state = _load(path)["state_dict"]
    cleaned_state = {name.replace('module.', ''): weights
                     for name, weights in saved_state.items()}
    model.load_state_dict(cleaned_state)

    return model.to(device).eval()
|
149 |
+
|
150 |
+
model = load_model(args.checkpoint_path)
|
151 |
+
|
152 |
+
def main():
    """Generate one lip-synced video per (audio, video) pair in ``args.filelist``.

    Each non-blank filelist line holds two whitespace-separated names relative
    to ``args.data_root`` and without the ``.mp4`` suffix: first the clip that
    supplies the audio, then the clip that supplies the face video. The result
    for line ``idx`` is written to ``<results_dir>/<idx>.mp4``.

    A pair is skipped when its audio cannot be extracted, its mel spectrogram
    contains NaNs, the audio is too short for a single mel chunk, the video has
    fewer frames than mel chunks, or no face is detected.
    """
    data_root = args.data_root
    temp_audio = '../temp/temp.wav'
    temp_video = '../temp/result.avi'

    os.makedirs(args.results_dir, exist_ok=True)
    os.makedirs(os.path.dirname(temp_audio), exist_ok=True)

    with open(args.filelist, 'r') as filelist:
        lines = filelist.readlines()

    for idx, line in enumerate(tqdm(lines)):
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines in the filelist
        audio_src, video = fields

        audio_src = os.path.join(data_root, audio_src) + '.mp4'
        video = os.path.join(data_root, video) + '.mp4'

        # Argument list instead of a shell string: paths containing spaces or
        # shell metacharacters are passed to ffmpeg untouched.
        extract_cmd = ['ffmpeg', '-loglevel', 'panic', '-y', '-i', audio_src,
                       '-strict', '-2', temp_audio]
        if subprocess.call(extract_cmd) != 0:
            # Without this check the previous pair's temp.wav would be reused.
            print('Skipping pair {}: could not extract audio from {}'.format(idx, audio_src))
            continue

        wav = audio.load_wav(temp_audio, 16000)
        mel = audio.melspectrogram(wav)
        if np.isnan(mel.reshape(-1)).sum() > 0:
            continue

        # One mel window of mel_step_size columns per output video frame.
        mel_chunks = []
        i = 0
        while 1:
            start_idx = int(i * mel_idx_multiplier)
            if start_idx + mel_step_size > len(mel[0]):
                break
            mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
            i += 1

        if not mel_chunks:
            continue  # audio too short; nothing to generate

        # Read only as many frames as there are mel chunks.
        video_stream = cv2.VideoCapture(video)
        full_frames = []
        while len(full_frames) < len(mel_chunks):
            still_reading, frame = video_stream.read()
            if not still_reading:
                break
            full_frames.append(frame)
        video_stream.release()

        if len(full_frames) < len(mel_chunks):
            continue

        try:
            face_det_results = face_detect(full_frames.copy())
        except ValueError:
            # face_detect raises ValueError when a frame has no detectable face.
            continue

        gen = datagen(full_frames.copy(), face_det_results, mel_chunks)

        frame_h, frame_w = full_frames[0].shape[:-1]
        out = cv2.VideoWriter(temp_video,
                              cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
        try:
            for img_batch, mel_batch, frames, coords in gen:
                img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
                mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

                with torch.no_grad():
                    pred = model(mel_batch, img_batch)

                pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

                # Paste each generated face back into its source frame.
                for pl, f, c in zip(pred, frames, coords):
                    y1, y2, x1, x2 = c
                    pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
                    f[y1:y2, x1:x2] = pl
                    out.write(f)
        finally:
            out.release()

        vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))

        mux_cmd = ['ffmpeg', '-loglevel', 'panic', '-y', '-i', temp_audio,
                   '-i', temp_video, '-strict', '-2', '-q:v', '1', vid]
        subprocess.call(mux_cmd)
|
236 |
+
|
237 |
+
if __name__ == '__main__':
|
238 |
+
main()
|
evaluation/real_videos_inference.py
ADDED
@@ -0,0 +1,305 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from os import listdir, path
|
2 |
+
import numpy as np
|
3 |
+
import scipy, cv2, os, sys, argparse
|
4 |
+
import dlib, json, subprocess
|
5 |
+
from tqdm import tqdm
|
6 |
+
from glob import glob
|
7 |
+
import torch
|
8 |
+
|
9 |
+
sys.path.append('../')
|
10 |
+
import audio
|
11 |
+
import face_detection
|
12 |
+
from models import Wav2Lip
|
13 |
+
|
14 |
+
# Command-line interface for the ReSyncED evaluation script.
parser = argparse.ArgumentParser(description='Code to generate results on ReSyncED evaluation set')

parser.add_argument('--mode', type=str,
                    help='random | dubbed | tts', required=True)

parser.add_argument('--filelist', type=str,
                    help='Filepath of filelist file to read', default=None)

parser.add_argument('--results_dir', type=str, help='Folder to save all results into',
                    required=True)
parser.add_argument('--data_root', type=str, required=True)
parser.add_argument('--checkpoint_path', type=str,
                    help='Name of saved checkpoint to load weights from', required=True)
parser.add_argument('--pads', nargs='+', type=int, default=[0, 10, 0, 0],
                    help='Padding (top, bottom, left, right)')

parser.add_argument('--face_det_batch_size', type=int,
                    help='Single GPU batch size for face detection', default=16)

parser.add_argument('--wav2lip_batch_size', type=int, help='Batch size for Wav2Lip', default=128)
# type=int is required on the three resolution options: without it a value
# given on the command line arrives as a str and breaks the arithmetic and
# comparisons that use it.
parser.add_argument('--face_res', type=int,
                    help='Approximate resolution of the face at which to test', default=180)
parser.add_argument('--min_frame_res', type=int,
                    help='Do not downsample further below this frame resolution', default=480)
parser.add_argument('--max_frame_res', type=int,
                    help='Downsample to at least this frame resolution', default=720)
# parser.add_argument('--resize_factor', default=1, type=int)

args = parser.parse_args()
# Side length (in pixels) that face crops are resized to before inference.
args.img_size = 96
|
41 |
+
|
42 |
+
def get_smoothened_boxes(boxes, T):
    """Smooth per-frame boxes in place with a forward-looking moving average.

    Box ``i`` becomes the mean of boxes ``i .. i+T-1``; once fewer than ``T``
    boxes remain, the mean of the last ``T`` boxes is used.

    Args:
        boxes: numpy array with one row per frame. It is modified in place, so
            means are cast to the array's dtype on assignment.
        T: window length in frames.

    Returns:
        The same ``boxes`` array, smoothed.
    """
    total = len(boxes)
    # A negative start index would wrap around for sequences shorter than T
    # and exclude the first boxes from the window, so clamp it to 0.
    last_window_start = max(0, total - T)
    for i in range(total):
        if i + T > total:
            window = boxes[last_window_start:]
        else:
            window = boxes[i:i + T]
        boxes[i] = np.mean(window, axis=0)
    return boxes
|
50 |
+
|
51 |
+
def rescale_frames(images):
    """Downsample all frames by an integer factor chosen from the first frame's face size.

    Factors 2..15 are tried in order. The search stops at the first factor
    that would push the shorter frame side below ``args.min_frame_res``, or
    whose downsampled face size is no closer to ``args.face_res`` than the
    original face size. The factor just before the stopping point is used.

    Args:
        images: sequence of frames; the face is detected on ``images[0]`` only.

    Returns:
        ``images`` unchanged when the chosen factor is 1, otherwise a new list
        of resized frames.

    Raises:
        ValueError: if no face is detected in the first frame.
    """
    first_frame = images[0]
    detection = detector.get_detections_for_batch(np.array([first_frame]))[0]
    if detection is None:
        raise ValueError('Face not detected!')

    frame_h, frame_w = first_frame.shape[:-1]
    left, top, right, bottom = detection
    face_size = max(np.abs(top - bottom), np.abs(left - right))

    # Distance of the undownsampled face from the target resolution.
    baseline_gap = np.abs(face_size - args.face_res)
    for factor in range(2, 16):
        frame_too_small = min(frame_h//factor, frame_w//factor) < args.min_frame_res
        if frame_too_small or np.abs(face_size // factor - args.face_res) >= baseline_gap:
            break

    factor -= 1
    if factor == 1:
        return images

    resized = []
    for im in images:
        target_size = (im.shape[1]//(factor), im.shape[0]//(factor))
        resized.append(cv2.resize(im, target_size))
    return resized
|
71 |
+
|
72 |
+
|
73 |
+
def face_detect(images):
    """Rescale the frames, then detect, pad and smooth one face box per frame.

    Detection runs in batches of ``args.face_det_batch_size``; on a
    ``RuntimeError`` the local batch size is halved and detection restarts
    from the first frame.

    Args:
        images: sequence of frames.

    Returns:
        ``(results, images)`` where ``results`` has one
        ``[face_crop, (y1, y2, x1, x2), True]`` entry per frame and ``images``
        are the frames returned by ``rescale_frames``.

    Raises:
        RuntimeError: if detection still fails with a batch size of 1.
        ValueError: if no face is detected in some frame.
    """
    chunk = args.face_det_batch_size
    images = rescale_frames(images)

    detections = None
    while detections is None:
        collected = []
        try:
            for start in range(0, len(images), chunk):
                batch = np.array(images[start:start + chunk])
                collected.extend(detector.get_detections_for_batch(batch))
        except RuntimeError:
            if chunk == 1:
                raise RuntimeError('Image too big to run face detection on GPU')
            chunk //= 2
            print('Recovering from OOM error; New batch size: {}'.format(chunk))
        else:
            detections = collected

    pad_top, pad_bottom, pad_left, pad_right = args.pads
    padded = []
    for rect, image in zip(detections, images):
        if rect is None:
            raise ValueError('Face not detected!')

        # Grow the box by the requested padding, clipped to the frame borders.
        top = max(0, rect[1] - pad_top)
        bottom = min(image.shape[0], rect[3] + pad_bottom)
        left = max(0, rect[0] - pad_left)
        right = min(image.shape[1], rect[2] + pad_right)
        padded.append([left, top, right, bottom])

    smoothed = get_smoothened_boxes(np.array(padded), T=5)
    crops = [
        [image[y1: y2, x1:x2], (y1, y2, x1, x2), True]
        for image, (x1, y1, x2, y2) in zip(images, smoothed)
    ]

    return crops, images
|
107 |
+
|
108 |
+
def datagen(frames, face_det_results, mels):
    """Yield model-ready batches of face crops and mel chunks.

    Args:
        frames: full video frames, indexable in step with ``mels``.
        face_det_results: per-frame ``[face, coords, valid]`` entries.
        mels: iterable of mel-spectrogram chunks, one per output frame.

    Yields:
        ``(img_batch, mel_batch, frame_batch, coords_batch)`` tuples of at most
        ``args.wav2lip_batch_size`` items. ``img_batch`` is the masked crop
        concatenated with the original crop along axis 3 and divided by 255;
        ``mel_batch`` gets a trailing singleton axis.

    Raises:
        ValueError: if there are more mel chunks than frames.
    """
    def _as_model_input(face_list, mel_list, frame_list, coord_list):
        # Turn the accumulated Python lists into the arrays the model consumes.
        face_arr, mel_arr = np.asarray(face_list), np.asarray(mel_list)

        hidden = face_arr.copy()
        # Zero everything from index img_size//2 onwards along axis 1.
        hidden[:, args.img_size//2:] = 0

        face_arr = np.concatenate((hidden, face_arr), axis=3) / 255.
        mel_arr = np.reshape(mel_arr, [len(mel_arr), mel_arr.shape[1], mel_arr.shape[2], 1])
        return face_arr, mel_arr, frame_list, coord_list

    img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    for i, m in enumerate(mels):
        if i >= len(frames):
            raise ValueError('Equal or less lengths only')

        frame_to_save = frames[i].copy()
        face, coords, valid_frame = face_det_results[i].copy()
        if not valid_frame:
            continue

        img_batch.append(cv2.resize(face, (args.img_size, args.img_size)))
        mel_batch.append(m)
        frame_batch.append(frame_to_save)
        coords_batch.append(coords)

        if len(img_batch) >= args.wav2lip_batch_size:
            yield _as_model_input(img_batch, mel_batch, frame_batch, coords_batch)
            img_batch, mel_batch, frame_batch, coords_batch = [], [], [], []

    # Emit whatever is left over as a final, smaller batch.
    if len(img_batch) > 0:
        yield _as_model_input(img_batch, mel_batch, frame_batch, coords_batch)
|
148 |
+
|
149 |
+
def increase_frames(frames, l):
    """Stretch ``frames`` to exactly ``l`` items by duplicating frames evenly.

    The previous implementation compared ``ceil(i * l / len(frames))`` with
    ``i``, which can only match at ``i == 0``; it therefore padded the clip
    with copies of the first frame, one per pass. This version resamples the
    frame indices uniformly, so every source frame is repeated either
    ``floor(l / n)`` or ``ceil(l / n)`` times and the original order is kept.

    Args:
        frames: sequence of frames.
        l: desired number of frames.

    Returns:
        A list of length ``l`` (or ``frames[:l]`` when ``frames`` already has
        at least ``l`` items). Duplicates reference the same frame object.

    Raises:
        ValueError: if ``frames`` is empty while ``l`` is positive.
    """
    available = len(frames)
    if available >= l:
        return frames[:l]
    if available == 0:
        raise ValueError('Cannot increase the length of an empty frame list')

    # Output position k maps to source frame floor(k * available / l).
    return [frames[(k * available) // l] for k in range(l)]
|
168 |
+
|
169 |
+
mel_step_size = 16
|
170 |
+
device = 'cuda' if torch.cuda.is_available() else 'cpu'
|
171 |
+
print('Using {} for inference.'.format(device))
|
172 |
+
|
173 |
+
detector = face_detection.FaceAlignment(face_detection.LandmarksType._2D,
|
174 |
+
flip_input=False, device=device)
|
175 |
+
|
176 |
+
def _load(checkpoint_path):
    """Read a checkpoint with ``torch.load``.

    On the ``'cuda'`` device the checkpoint is loaded as saved; otherwise a
    ``map_location`` callback returns each storage unchanged instead of moving
    it to the device recorded in the file.
    """
    load_kwargs = {}
    if device != 'cuda':
        load_kwargs['map_location'] = lambda storage, loc: storage
    return torch.load(checkpoint_path, **load_kwargs)
|
183 |
+
|
184 |
+
def load_model(path):
    """Create a ``Wav2Lip`` model, restore its weights from ``path``, and return it in eval mode.

    Every ``'module.'`` substring is removed from the checkpoint's parameter
    names before they are loaded (presumably left by a DataParallel wrapper).
    """
    network = Wav2Lip()
    print("Load checkpoint from: {}".format(path))

    stored = _load(path)["state_dict"]
    renamed = {}
    for key in stored:
        renamed[key.replace('module.', '')] = stored[key]
    network.load_state_dict(renamed)

    network = network.to(device)
    return network.eval()
|
196 |
+
|
197 |
+
model = load_model(args.checkpoint_path)
|
198 |
+
|
199 |
+
def main():
    """Generate lip-synced results for the ReSyncED evaluation set.

    In ``dubbed`` mode every file in ``args.data_root`` is paired with itself.
    Otherwise each non-blank line of ``args.filelist`` holds a video file name
    followed by an audio source file name, both relative to ``args.data_root``.
    The result for entry ``idx`` is written to ``<results_dir>/<idx>.mp4``.

    Raises:
        ValueError: if ``--filelist`` is missing outside ``dubbed`` mode, the
            mel spectrogram contains NaNs, the video cannot be read, or the
            video is shorter than the audio outside ``tts`` mode.
    """
    temp_audio = '../temp/temp.wav'
    temp_video = '../temp/result.avi'

    os.makedirs(args.results_dir, exist_ok=True)
    os.makedirs(os.path.dirname(temp_audio), exist_ok=True)

    if args.mode == 'dubbed':
        # Pair each file with itself directly; formatting and re-splitting a
        # "name name" string would break on file names containing whitespace.
        pairs = [(f, f) for f in listdir(args.data_root)]
    else:
        if args.filelist is None:
            raise ValueError('--filelist is required unless --mode is dubbed')
        with open(args.filelist, 'r') as filelist:
            pairs = [line.split() for line in filelist]

    for idx, pair in enumerate(tqdm(pairs)):
        if not pair:
            continue  # blank line in the filelist
        video, audio_src = pair

        audio_src = os.path.join(args.data_root, audio_src)
        video = os.path.join(args.data_root, video)

        # Argument list instead of a shell string: paths are passed verbatim.
        extract_cmd = ['ffmpeg', '-loglevel', 'panic', '-y', '-i', audio_src,
                       '-strict', '-2', temp_audio]
        if subprocess.call(extract_cmd) != 0:
            # Without this check the previous pair's temp.wav would be reused.
            print('Skipping pair {}: could not extract audio from {}'.format(idx, audio_src))
            continue

        wav = audio.load_wav(temp_audio, 16000)
        mel = audio.melspectrogram(wav)

        if np.isnan(mel.reshape(-1)).sum() > 0:
            raise ValueError('Mel contains nan!')

        video_stream = cv2.VideoCapture(video)

        fps = video_stream.get(cv2.CAP_PROP_FPS)
        if not fps or fps <= 0:
            video_stream.release()
            raise ValueError('Could not read video (FPS unavailable): {}'.format(video))
        mel_idx_multiplier = 80./fps

        full_frames = []
        while 1:
            still_reading, frame = video_stream.read()
            if not still_reading:
                video_stream.release()
                break

            # Cap the shorter frame side at max_frame_res, keeping aspect ratio.
            if min(frame.shape[:-1]) > args.max_frame_res:
                h, w = frame.shape[:-1]
                scale_factor = min(h, w) / float(args.max_frame_res)
                h = int(h/scale_factor)
                w = int(w/scale_factor)

                frame = cv2.resize(frame, (w, h))
            full_frames.append(frame)

        # One mel window of mel_step_size columns per output video frame.
        mel_chunks = []
        i = 0
        while 1:
            start_idx = int(i * mel_idx_multiplier)
            if start_idx + mel_step_size > len(mel[0]):
                break
            mel_chunks.append(mel[:, start_idx : start_idx + mel_step_size])
            i += 1

        if not mel_chunks:
            continue  # audio too short; nothing to generate

        if len(full_frames) < len(mel_chunks):
            if args.mode == 'tts':
                full_frames = increase_frames(full_frames, len(mel_chunks))
            else:
                raise ValueError('#Frames, audio length mismatch')
        else:
            full_frames = full_frames[:len(mel_chunks)]

        try:
            face_det_results, full_frames = face_detect(full_frames.copy())
        except ValueError:
            # face_detect raises ValueError when a frame has no detectable face.
            continue

        gen = datagen(full_frames.copy(), face_det_results, mel_chunks)

        # Writer is created up front so release() below is always well defined.
        frame_h, frame_w = full_frames[0].shape[:-1]
        out = cv2.VideoWriter(temp_video,
                              cv2.VideoWriter_fourcc(*'DIVX'), fps, (frame_w, frame_h))
        try:
            for img_batch, mel_batch, frames, coords in gen:
                img_batch = torch.FloatTensor(np.transpose(img_batch, (0, 3, 1, 2))).to(device)
                mel_batch = torch.FloatTensor(np.transpose(mel_batch, (0, 3, 1, 2))).to(device)

                with torch.no_grad():
                    pred = model(mel_batch, img_batch)

                pred = pred.cpu().numpy().transpose(0, 2, 3, 1) * 255.

                # Paste each generated face back into its source frame.
                for pl, f, c in zip(pred, frames, coords):
                    y1, y2, x1, x2 = c
                    pl = cv2.resize(pl.astype(np.uint8), (x2 - x1, y2 - y1))
                    f[y1:y2, x1:x2] = pl
                    out.write(f)
        finally:
            out.release()

        vid = os.path.join(args.results_dir, '{}.mp4'.format(idx))
        mux_cmd = ['ffmpeg', '-loglevel', 'panic', '-y', '-i', temp_audio,
                   '-i', temp_video, '-strict', '-2', '-q:v', '1', vid]
        subprocess.call(mux_cmd)
|
302 |
+
|
303 |
+
|
304 |
+
if __name__ == '__main__':
|
305 |
+
main()
|
evaluation/scores_LSE/SyncNetInstance_calc_scores.py
ADDED
@@ -0,0 +1,210 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python
|
2 |
+
#-*- coding: utf-8 -*-
|
3 |
+
# Video 25 FPS, Audio 16000HZ
|
4 |
+
|
5 |
+
import torch
|
6 |
+
import numpy
|
7 |
+
import time, pdb, argparse, subprocess, os, math, glob
|
8 |
+
import cv2
|
9 |
+
import python_speech_features
|
10 |
+
|
11 |
+
from scipy import signal
|
12 |
+
from scipy.io import wavfile
|
13 |
+
from SyncNetModel import *
|
14 |
+
from shutil import rmtree
|
15 |
+
|
16 |
+
|
17 |
+
# ==================== Get OFFSET ====================
|
18 |
+
|
19 |
+
def calc_pdist(feat1, feat2, vshift=10):
    """Compute, for every row of ``feat1``, its distances to a sliding window of ``feat2`` rows.

    ``feat2`` is zero-padded with ``vshift`` rows at both ends, so row ``i`` of
    ``feat1`` is compared with the ``2 * vshift + 1`` rows of ``feat2`` at
    offsets ``-vshift .. +vshift`` around ``i``.

    Args:
        feat1: 2-D tensor of features, one row per time step.
        feat2: 2-D tensor of features, one row per time step.
        vshift: maximum shift, in rows, considered in each direction.

    Returns:
        A list with one 1-D tensor of ``2 * vshift + 1`` pairwise distances
        per row of ``feat1``.
    """
    window = 2 * vshift + 1
    padded = torch.nn.functional.pad(feat2, (0, 0, vshift, vshift))

    return [
        torch.nn.functional.pairwise_distance(
            feat1[[row], :].repeat(window, 1), padded[row:row + window, :])
        for row in range(len(feat1))
    ]
|
32 |
+
|
33 |
+
# ==================== MAIN DEF ====================
|
34 |
+
|
35 |
+
class SyncNetInstance(torch.nn.Module):
    """Wrapper around the SyncNet model ``S`` that scores audio-visual sync of a video."""

    def __init__(self, dropout = 0, num_layers_in_fc_layers = 1024):
        """Build the underlying model and move it to the GPU.

        ``dropout`` is accepted but not used by this class.
        """
        super(SyncNetInstance, self).__init__()

        self.__S__ = S(num_layers_in_fc_layers = num_layers_in_fc_layers).cuda()

    def evaluate(self, opt, videofile):
        """Compute the audio-visual offset and sync scores for one video.

        Frames and a 16 kHz mono WAV are extracted with ffmpeg into
        ``<opt.tmp_dir>/<opt.reference>``, which is wiped first.

        Args:
            opt: options object providing ``tmp_dir``, ``reference``,
                ``batch_size`` and ``vshift``.
            videofile: path of the video to score.

        Returns:
            ``(offset, confidence, min_distance)`` as numpy values, where
            ``min_distance`` is the smallest mean distance over all shifts and
            ``confidence`` is the median mean distance minus that minimum.

        Raises:
            ValueError: if no frames could be extracted or the clip is too
                short to form a single 5-frame window.
        """
        self.__S__.eval()

        # ========== ==========
        # Convert files
        # ========== ==========

        work_dir = os.path.join(opt.tmp_dir, opt.reference)
        if os.path.exists(work_dir):
            rmtree(work_dir)
        os.makedirs(work_dir)

        audio_path = os.path.join(work_dir, 'audio.wav')

        # Argument lists (no shell) so paths with spaces or shell
        # metacharacters reach ffmpeg unchanged.
        subprocess.call(['ffmpeg', '-loglevel', 'error', '-y', '-i', videofile,
                         '-threads', '1', '-f', 'image2',
                         os.path.join(work_dir, '%06d.jpg')])
        subprocess.call(['ffmpeg', '-loglevel', 'error', '-y', '-i', videofile,
                         '-async', '1', '-ac', '1', '-vn', '-acodec', 'pcm_s16le',
                         '-ar', '16000', audio_path])

        # ========== ==========
        # Load video
        # ========== ==========

        images = []
        for fname in sorted(glob.glob(os.path.join(work_dir, '*.jpg'))):
            img_input = cv2.imread(fname)
            img_input = cv2.resize(img_input, (224, 224))  # fixed input size used by this script
            images.append(img_input)

        if not images:
            raise ValueError('No frames could be extracted from {}'.format(videofile))

        # (H, W, C, T) -> (1, H, W, C, T) -> (1, C, T, H, W)
        im = numpy.stack(images, axis=3)
        im = numpy.expand_dims(im, axis=0)
        im = numpy.transpose(im, (0, 3, 4, 1, 2))

        imtv = torch.from_numpy(im.astype(float)).float()

        # ========== ==========
        # Load audio
        # ========== ==========

        sample_rate, audio = wavfile.read(audio_path)
        mfcc = zip(*python_speech_features.mfcc(audio, sample_rate))
        mfcc = numpy.stack([numpy.array(i) for i in mfcc])

        cc = numpy.expand_dims(numpy.expand_dims(mfcc, axis=0), axis=0)
        cct = torch.from_numpy(cc.astype(float)).float()

        # ========== ==========
        # Check audio and video input length
        # ========== ==========

        # 640 audio samples per video frame (16000 Hz / 25 FPS, per the file header).
        min_length = min(len(images), math.floor(len(audio)/640))

        # ========== ==========
        # Generate video and audio feats
        # ========== ==========

        lastframe = min_length-5
        if lastframe <= 0:
            raise ValueError('Video too short to evaluate: {}'.format(videofile))

        im_feat = []
        cc_feat = []

        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for i in range(0, lastframe, opt.batch_size):
                batch_frames = range(i, min(lastframe, i+opt.batch_size))

                # 5-frame video windows.
                im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in batch_frames ]
                im_in = torch.cat(im_batch, 0)
                im_out = self.__S__.forward_lip(im_in.cuda())
                im_feat.append(im_out.data.cpu())

                # Matching 20-step MFCC windows (4 MFCC steps per video frame).
                cc_batch = [ cct[:,:,:,vframe*4:vframe*4+20] for vframe in batch_frames ]
                cc_in = torch.cat(cc_batch, 0)
                cc_out = self.__S__.forward_aud(cc_in.cuda())
                cc_feat.append(cc_out.data.cpu())

        im_feat = torch.cat(im_feat, 0)
        cc_feat = torch.cat(cc_feat, 0)

        # ========== ==========
        # Compute offset
        # ========== ==========

        dists = calc_pdist(im_feat, cc_feat, vshift=opt.vshift)
        mdist = torch.mean(torch.stack(dists, 1), 1)

        minval, minidx = torch.min(mdist, 0)

        offset = opt.vshift-minidx
        conf = torch.median(mdist) - minval

        # Kept from the original: sets numpy's global float print format.
        numpy.set_printoptions(formatter={'float': '{: 0.3f}'.format})

        return offset.numpy(), conf.numpy(), minval.numpy()

    def extract_feature(self, opt, videofile):
        """Return lip features for every 5-frame window of ``videofile``.

        Args:
            opt: options object providing ``batch_size``.
            videofile: path of the video to read with OpenCV.

        Returns:
            Tensor of features from ``forward_lipfeat``, one row per window.

        Raises:
            ValueError: if the video has fewer than 5 readable frames.
        """
        self.__S__.eval()

        # ========== ==========
        # Load video
        # ========== ==========
        cap = cv2.VideoCapture(videofile)
        images = []
        try:
            while True:
                ret, image = cap.read()
                if not ret:
                    break
                images.append(image)
        finally:
            # Release the capture handle even if reading fails.
            cap.release()

        lastframe = len(images)-4
        if lastframe <= 0:
            raise ValueError('Video too short to extract features: {}'.format(videofile))

        # (H, W, C, T) -> (1, H, W, C, T) -> (1, C, T, H, W)
        im = numpy.stack(images, axis=3)
        im = numpy.expand_dims(im, axis=0)
        im = numpy.transpose(im, (0, 3, 4, 1, 2))

        imtv = torch.from_numpy(im.astype(float)).float()

        # ========== ==========
        # Generate video feats
        # ========== ==========

        im_feat = []

        tS = time.time()
        with torch.no_grad():
            for i in range(0, lastframe, opt.batch_size):
                im_batch = [ imtv[:,:,vframe:vframe+5,:,:] for vframe in range(i, min(lastframe, i+opt.batch_size)) ]
                im_in = torch.cat(im_batch, 0)
                im_out = self.__S__.forward_lipfeat(im_in.cuda())
                im_feat.append(im_out.data.cpu())

        im_feat = torch.cat(im_feat, 0)

        print('Compute time %.3f sec.' % (time.time()-tS))

        return im_feat

    def loadParameters(self, path):
        """Copy the parameters stored at ``path`` into the wrapped model.

        The checkpoint is loaded with a ``map_location`` callback that returns
        each storage unchanged, then every entry is copied into the model's
        state-dict entry of the same name.
        """
        loaded_state = torch.load(path, map_location=lambda storage, loc: storage)

        self_state = self.__S__.state_dict()

        for name, param in loaded_state.items():
            self_state[name].copy_(param)
|
evaluation/scores_LSE/calculate_scores_LRS.py
ADDED
@@ -0,0 +1,53 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python
|
2 |
+
#-*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import time, pdb, argparse, subprocess
|
5 |
+
import glob
|
6 |
+
import os
|
7 |
+
from tqdm import tqdm
|
8 |
+
|
9 |
+
from SyncNetInstance_calc_scores import *
|
10 |
+
|
11 |
+
# ==================== LOAD PARAMS ====================

parser = argparse.ArgumentParser(description="SyncNet")

parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--data_root', type=str, required=True, help='')
parser.add_argument('--tmp_dir', type=str, default="data/work/pytmp", help='')
parser.add_argument('--reference', type=str, default="demo", help='')

opt = parser.parse_args()


# ==================== RUN EVALUATION ====================

# Collect the videos first: with nothing to score, the averages printed at
# the end would divide by zero, so stop before paying for the model load.
path = os.path.join(opt.data_root, "*.mp4")
all_videos = glob.glob(path)

if not all_videos:
    raise SystemExit('No .mp4 files found in {}'.format(opt.data_root))

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
#print("Model %s loaded."%opt.initial_model);

prog_bar = tqdm(all_videos)
avg_confidence = 0.
avg_min_distance = 0.

for videofile_idx, videofile in enumerate(prog_bar):
    offset, confidence, min_distance = s.evaluate(opt, videofile=videofile)
    avg_confidence += confidence
    avg_min_distance += min_distance

    # Show the running means over the videos processed so far.
    seen = videofile_idx + 1
    prog_bar.set_description('Avg Confidence: {}, Avg Minimum Dist: {}'.format(round(avg_confidence / seen, 3), round(avg_min_distance / seen, 3)))
    prog_bar.refresh()

print ('Average Confidence: {}'.format(avg_confidence/len(all_videos)))
print ('Average Minimum Distance: {}'.format(avg_min_distance/len(all_videos)))
|
51 |
+
|
52 |
+
|
53 |
+
|
evaluation/scores_LSE/calculate_scores_real_videos.py
ADDED
@@ -0,0 +1,45 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
#!/usr/bin/python
|
2 |
+
#-*- coding: utf-8 -*-
|
3 |
+
|
4 |
+
import time, pdb, argparse, subprocess, pickle, os, gzip, glob
|
5 |
+
|
6 |
+
from SyncNetInstance_calc_scores import *
|
7 |
+
|
8 |
+
# ==================== PARSE ARGUMENT ====================

parser = argparse.ArgumentParser(description="SyncNet")
parser.add_argument('--initial_model', type=str, default="data/syncnet_v2.model", help='')
parser.add_argument('--batch_size', type=int, default=20, help='')
parser.add_argument('--vshift', type=int, default=15, help='')
parser.add_argument('--data_dir', type=str, default='data/work', help='')
parser.add_argument('--videofile', type=str, default='', help='')
parser.add_argument('--reference', type=str, default='', help='')
opt = parser.parse_args()

# Working sub-directories derived from --data_dir.
opt.avi_dir = os.path.join(opt.data_dir, 'pyavi')
opt.tmp_dir = os.path.join(opt.data_dir, 'pytmp')
opt.work_dir = os.path.join(opt.data_dir, 'pywork')
opt.crop_dir = os.path.join(opt.data_dir, 'pycrop')


# ==================== LOAD MODEL AND FILE LIST ====================

s = SyncNetInstance()

s.loadParameters(opt.initial_model)
#print("Model %s loaded."%opt.initial_model);

# Sorted so that the per-file output lines come out in a stable order.
flist = sorted(glob.glob(os.path.join(opt.crop_dir, opt.reference, '0*.avi')))

# ==================== GET OFFSETS ====================

# Keep every distance so the optional dump below has something to write.
dists = []
for fname in flist:
    offset, conf, dist = s.evaluate(opt, videofile=fname)
    dists.append(dist)
    print (str(dist)+" "+str(conf))

# ==================== PRINT RESULTS TO FILE ====================

#with open(os.path.join(opt.work_dir,opt.reference,'activesd.pckl'), 'wb') as fil:
#    pickle.dump(dists, fil)
|
evaluation/scores_LSE/calculate_scores_real_videos.sh
ADDED
@@ -0,0 +1,8 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Usage: sh calculate_scores_real_videos.sh <folder_with_videos>
# Runs the preprocessing pipeline and the scorer on every file in the given
# folder and appends the scorer's output to all_scores.txt.

# Start from a clean score file; -f keeps a missing file (first run) from
# being reported as an error.
rm -f all_scores.txt

# Iterate with a glob instead of parsing `ls`, and quote every expansion, so
# file names containing spaces are passed through intact.
for eachfile in "$1"/*
do
    # An unmatched glob stays literal; skip it when the folder is empty.
    [ -e "$eachfile" ] || continue
    python run_pipeline.py --videofile "$eachfile" --reference wav2lip --data_dir tmp_dir
    python calculate_scores_real_videos.py --videofile "$eachfile" --reference wav2lip --data_dir tmp_dir >> all_scores.txt
done
|
evaluation/test_filelists/README.md
ADDED
@@ -0,0 +1,13 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
This folder contains the filelists for the new evaluation framework proposed in the paper.
|
2 |
+
|
3 |
+
## Test filelists for LRS2, LRS3, and LRW.
|
4 |
+
|
5 |
+
This folder contains three filelists, each containing a list of names of audio-video pairs from the test sets of LRS2, LRS3, and LRW. The LRS2 and LRW filelists are strictly "Copyright BBC" and can only be used for “non-commercial research by applicants who have an agreement with the BBC to access the Lip Reading in the Wild and/or Lip Reading Sentences in the Wild datasets”. Please follow this link for more details: [https://www.bbc.co.uk/rd/projects/lip-reading-datasets](https://www.bbc.co.uk/rd/projects/lip-reading-datasets).
|
6 |
+
|
7 |
+
|
8 |
+
## ReSyncED benchmark
|
9 |
+
|
10 |
+
The sub-folder `ReSyncED` contains filelists for our own Real-world lip-Sync Evaluation Dataset (ReSyncED).
|
11 |
+
|
12 |
+
|
13 |
+
#### Instructions on how to use the above two filelists are available in the README of the parent folder.
|
evaluation/test_filelists/ReSyncED/random_pairs.txt
ADDED
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
sachin.mp4 emma_cropped.mp4
|
2 |
+
sachin.mp4 mourinho.mp4
|
3 |
+
sachin.mp4 elon.mp4
|
4 |
+
sachin.mp4 messi2.mp4
|
5 |
+
sachin.mp4 cr1.mp4
|
6 |
+
sachin.mp4 sachin.mp4
|
7 |
+
sachin.mp4 sg.mp4
|
8 |
+
sachin.mp4 fergi.mp4
|
9 |
+
sachin.mp4 spanish_lec1.mp4
|
10 |
+
sachin.mp4 bush_small.mp4
|
11 |
+
sachin.mp4 macca_cut.mp4
|
12 |
+
sachin.mp4 ca_cropped.mp4
|
13 |
+
sachin.mp4 lecun.mp4
|
14 |
+
sachin.mp4 spanish_lec0.mp4
|
15 |
+
srk.mp4 emma_cropped.mp4
|
16 |
+
srk.mp4 mourinho.mp4
|
17 |
+
srk.mp4 elon.mp4
|
18 |
+
srk.mp4 messi2.mp4
|
19 |
+
srk.mp4 cr1.mp4
|
20 |
+
srk.mp4 srk.mp4
|
21 |
+
srk.mp4 sachin.mp4
|
22 |
+
srk.mp4 sg.mp4
|
23 |
+
srk.mp4 fergi.mp4
|
24 |
+
srk.mp4 spanish_lec1.mp4
|
25 |
+
srk.mp4 bush_small.mp4
|
26 |
+
srk.mp4 macca_cut.mp4
|
27 |
+
srk.mp4 ca_cropped.mp4
|
28 |
+
srk.mp4 guardiola.mp4
|
29 |
+
srk.mp4 lecun.mp4
|
30 |
+
srk.mp4 spanish_lec0.mp4
|
31 |
+
cr1.mp4 emma_cropped.mp4
|
32 |
+
cr1.mp4 elon.mp4
|
33 |
+
cr1.mp4 messi2.mp4
|
34 |
+
cr1.mp4 cr1.mp4
|
35 |
+
cr1.mp4 spanish_lec1.mp4
|
36 |
+
cr1.mp4 bush_small.mp4
|
37 |
+
cr1.mp4 macca_cut.mp4
|
38 |
+
cr1.mp4 ca_cropped.mp4
|
39 |
+
cr1.mp4 lecun.mp4
|
40 |
+
cr1.mp4 spanish_lec0.mp4
|
41 |
+
macca_cut.mp4 emma_cropped.mp4
|
42 |
+
macca_cut.mp4 elon.mp4
|
43 |
+
macca_cut.mp4 messi2.mp4
|
44 |
+
macca_cut.mp4 spanish_lec1.mp4
|
45 |
+
macca_cut.mp4 macca_cut.mp4
|
46 |
+
macca_cut.mp4 ca_cropped.mp4
|
47 |
+
macca_cut.mp4 spanish_lec0.mp4
|
48 |
+
lecun.mp4 emma_cropped.mp4
|
49 |
+
lecun.mp4 elon.mp4
|
50 |
+
lecun.mp4 messi2.mp4
|
51 |
+
lecun.mp4 spanish_lec1.mp4
|
52 |
+
lecun.mp4 macca_cut.mp4
|
53 |
+
lecun.mp4 ca_cropped.mp4
|
54 |
+
lecun.mp4 lecun.mp4
|
55 |
+
lecun.mp4 spanish_lec0.mp4
|
56 |
+
messi2.mp4 emma_cropped.mp4
|
57 |
+
messi2.mp4 elon.mp4
|
58 |
+
messi2.mp4 messi2.mp4
|
59 |
+
messi2.mp4 spanish_lec1.mp4
|
60 |
+
messi2.mp4 macca_cut.mp4
|
61 |
+
messi2.mp4 ca_cropped.mp4
|
62 |
+
messi2.mp4 spanish_lec0.mp4
|
63 |
+
ca_cropped.mp4 emma_cropped.mp4
|
64 |
+
ca_cropped.mp4 elon.mp4
|
65 |
+
ca_cropped.mp4 spanish_lec1.mp4
|
66 |
+
ca_cropped.mp4 ca_cropped.mp4
|
67 |
+
ca_cropped.mp4 spanish_lec0.mp4
|
68 |
+
spanish_lec1.mp4 spanish_lec1.mp4
|
69 |
+
spanish_lec1.mp4 spanish_lec0.mp4
|
70 |
+
elon.mp4 elon.mp4
|
71 |
+
elon.mp4 spanish_lec1.mp4
|
72 |
+
elon.mp4 spanish_lec0.mp4
|
73 |
+
guardiola.mp4 emma_cropped.mp4
|
74 |
+
guardiola.mp4 mourinho.mp4
|
75 |
+
guardiola.mp4 elon.mp4
|
76 |
+
guardiola.mp4 messi2.mp4
|
77 |
+
guardiola.mp4 cr1.mp4
|
78 |
+
guardiola.mp4 sachin.mp4
|
79 |
+
guardiola.mp4 sg.mp4
|
80 |
+
guardiola.mp4 fergi.mp4
|
81 |
+
guardiola.mp4 spanish_lec1.mp4
|
82 |
+
guardiola.mp4 bush_small.mp4
|
83 |
+
guardiola.mp4 macca_cut.mp4
|
84 |
+
guardiola.mp4 ca_cropped.mp4
|
85 |
+
guardiola.mp4 guardiola.mp4
|
86 |
+
guardiola.mp4 lecun.mp4
|
87 |
+
guardiola.mp4 spanish_lec0.mp4
|
88 |
+
fergi.mp4 emma_cropped.mp4
|
89 |
+
fergi.mp4 mourinho.mp4
|
90 |
+
fergi.mp4 elon.mp4
|
91 |
+
fergi.mp4 messi2.mp4
|
92 |
+
fergi.mp4 cr1.mp4
|
93 |
+
fergi.mp4 sachin.mp4
|
94 |
+
fergi.mp4 sg.mp4
|
95 |
+
fergi.mp4 fergi.mp4
|
96 |
+
fergi.mp4 spanish_lec1.mp4
|
97 |
+
fergi.mp4 bush_small.mp4
|
98 |
+
fergi.mp4 macca_cut.mp4
|
99 |
+
fergi.mp4 ca_cropped.mp4
|
100 |
+
fergi.mp4 lecun.mp4
|
101 |
+
fergi.mp4 spanish_lec0.mp4
|
102 |
+
spanish.mp4 emma_cropped.mp4
|
103 |
+
spanish.mp4 spanish.mp4
|
104 |
+
spanish.mp4 mourinho.mp4
|
105 |
+
spanish.mp4 elon.mp4
|
106 |
+
spanish.mp4 messi2.mp4
|
107 |
+
spanish.mp4 cr1.mp4
|
108 |
+
spanish.mp4 srk.mp4
|
109 |
+
spanish.mp4 sachin.mp4
|
110 |
+
spanish.mp4 sg.mp4
|
111 |
+
spanish.mp4 fergi.mp4
|
112 |
+
spanish.mp4 spanish_lec1.mp4
|
113 |
+
spanish.mp4 bush_small.mp4
|
114 |
+
spanish.mp4 macca_cut.mp4
|
115 |
+
spanish.mp4 ca_cropped.mp4
|
116 |
+
spanish.mp4 guardiola.mp4
|
117 |
+
spanish.mp4 lecun.mp4
|
118 |
+
spanish.mp4 spanish_lec0.mp4
|
119 |
+
bush_small.mp4 emma_cropped.mp4
|
120 |
+
bush_small.mp4 elon.mp4
|
121 |
+
bush_small.mp4 messi2.mp4
|
122 |
+
bush_small.mp4 spanish_lec1.mp4
|
123 |
+
bush_small.mp4 bush_small.mp4
|
124 |
+
bush_small.mp4 macca_cut.mp4
|
125 |
+
bush_small.mp4 ca_cropped.mp4
|
126 |
+
bush_small.mp4 lecun.mp4
|
127 |
+
bush_small.mp4 spanish_lec0.mp4
|
128 |
+
emma_cropped.mp4 emma_cropped.mp4
|
129 |
+
emma_cropped.mp4 elon.mp4
|
130 |
+
emma_cropped.mp4 spanish_lec1.mp4
|
131 |
+
emma_cropped.mp4 spanish_lec0.mp4
|
132 |
+
sg.mp4 emma_cropped.mp4
|
133 |
+
sg.mp4 mourinho.mp4
|
134 |
+
sg.mp4 elon.mp4
|
135 |
+
sg.mp4 messi2.mp4
|
136 |
+
sg.mp4 cr1.mp4
|
137 |
+
sg.mp4 sachin.mp4
|
138 |
+
sg.mp4 sg.mp4
|
139 |
+
sg.mp4 fergi.mp4
|
140 |
+
sg.mp4 spanish_lec1.mp4
|
141 |
+
sg.mp4 bush_small.mp4
|
142 |
+
sg.mp4 macca_cut.mp4
|
143 |
+
sg.mp4 ca_cropped.mp4
|
144 |
+
sg.mp4 lecun.mp4
|
145 |
+
sg.mp4 spanish_lec0.mp4
|
146 |
+
spanish_lec0.mp4 spanish_lec0.mp4
|
147 |
+
mourinho.mp4 emma_cropped.mp4
|
148 |
+
mourinho.mp4 mourinho.mp4
|
149 |
+
mourinho.mp4 elon.mp4
|
150 |
+
mourinho.mp4 messi2.mp4
|
151 |
+
mourinho.mp4 cr1.mp4
|
152 |
+
mourinho.mp4 sachin.mp4
|
153 |
+
mourinho.mp4 sg.mp4
|
154 |
+
mourinho.mp4 fergi.mp4
|
155 |
+
mourinho.mp4 spanish_lec1.mp4
|
156 |
+
mourinho.mp4 bush_small.mp4
|
157 |
+
mourinho.mp4 macca_cut.mp4
|
158 |
+
mourinho.mp4 ca_cropped.mp4
|
159 |
+
mourinho.mp4 lecun.mp4
|
160 |
+
mourinho.mp4 spanish_lec0.mp4
|
evaluation/test_filelists/ReSyncED/tts_pairs.txt
ADDED
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
adam_1.mp4 andreng_optimization.wav
|
2 |
+
agad_2.mp4 agad_2.wav
|
3 |
+
agad_1.mp4 agad_1.wav
|
4 |
+
agad_3.mp4 agad_3.wav
|
5 |
+
rms_prop_1.mp4 rms_prop_tts.wav
|
6 |
+
tf_1.mp4 tf_1.wav
|
7 |
+
tf_2.mp4 tf_2.wav
|
8 |
+
andrew_ng_ai_business.mp4 andrewng_business_tts.wav
|
9 |
+
covid_autopsy_1.mp4 autopsy_tts.wav
|
10 |
+
news_1.mp4 news_tts.wav
|
11 |
+
andrew_ng_fund_1.mp4 andrewng_ai_fund.wav
|
12 |
+
covid_treatments_1.mp4 covid_tts.wav
|
13 |
+
pytorch_v_tf.mp4 pytorch_vs_tf_eng.wav
|
14 |
+
pytorch_1.mp4 pytorch.wav
|
15 |
+
pkb_1.mp4 pkb_1.wav
|
16 |
+
ss_1.mp4 ss_1.wav
|
17 |
+
carlsen_1.mp4 carlsen_eng.wav
|
18 |
+
french.mp4 french.wav
|
evaluation/test_filelists/lrs2.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/test_filelists/lrs3.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
evaluation/test_filelists/lrw.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
face_detection/README.md
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
The code for Face Detection in this folder has been taken from the wonderful [face_alignment](https://github.com/1adrianb/face-alignment) repository. This has been modified to take batches of faces at a time.
|
face_detection/__init__.py
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# -*- coding: utf-8 -*-
|
2 |
+
|
3 |
+
__author__ = """Adrian Bulat"""
|
4 |
+
__email__ = '[email protected]'
|
5 |
+
__version__ = '1.0.1'
|
6 |
+
|
7 |
+
from .api import FaceAlignment, LandmarksType, NetworkSize
|
face_detection/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (330 Bytes). View file
|
|
face_detection/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (338 Bytes). View file
|
|
face_detection/__pycache__/__init__.cpython-39.pyc
ADDED
Binary file (338 Bytes). View file
|
|
face_detection/__pycache__/api.cpython-36.pyc
ADDED
Binary file (2.7 kB). View file
|
|
face_detection/__pycache__/api.cpython-38.pyc
ADDED
Binary file (2.74 kB). View file
|
|
face_detection/__pycache__/api.cpython-39.pyc
ADDED
Binary file (2.73 kB). View file
|
|
face_detection/__pycache__/models.cpython-36.pyc
ADDED
Binary file (7.13 kB). View file
|
|
face_detection/__pycache__/models.cpython-38.pyc
ADDED
Binary file (7.14 kB). View file
|
|
face_detection/__pycache__/models.cpython-39.pyc
ADDED
Binary file (7.11 kB). View file
|
|
face_detection/__pycache__/utils.cpython-36.pyc
ADDED
Binary file (10.1 kB). View file
|
|
face_detection/__pycache__/utils.cpython-38.pyc
ADDED
Binary file (10.2 kB). View file
|
|
face_detection/__pycache__/utils.cpython-39.pyc
ADDED
Binary file (9.9 kB). View file
|
|
face_detection/api.py
ADDED
@@ -0,0 +1,79 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
import os
|
3 |
+
import torch
|
4 |
+
from torch.utils.model_zoo import load_url
|
5 |
+
from enum import Enum
|
6 |
+
import numpy as np
|
7 |
+
import cv2
|
8 |
+
try:
|
9 |
+
import urllib.request as request_file
|
10 |
+
except BaseException:
|
11 |
+
import urllib as request_file
|
12 |
+
|
13 |
+
from .models import FAN, ResNetDepth
|
14 |
+
from .utils import *
|
15 |
+
|
16 |
+
|
17 |
+
class LandmarksType(Enum):
    """Enum class defining the type of landmarks to detect.

    ``_2D`` - the detected points ``(x,y)`` are detected in a 2D space and follow the visible contour of the face
    ``_2halfD`` - these points represent the projection of the 3D points into 2D
        (the earlier wording said "into 3D", presumably a typo -- TODO confirm)
    ``_3D`` - detect the points ``(x,y,z)`` in a 3D space

    """
    _2D = 1
    _2halfD = 2
    _3D = 3
|
28 |
+
|
29 |
+
|
30 |
+
class NetworkSize(Enum):
    """Network size selector that can be converted with ``int()``.

    Only ``LARGE`` is defined; the smaller variants are left commented out.
    """
    # TINY = 1
    # SMALL = 2
    # MEDIUM = 3
    LARGE = 4

    def __new__(cls, value):
        # Build the member by hand and store the raw value on it.
        instance = object.__new__(cls)
        instance._value_ = value
        return instance

    def __int__(self):
        # Lets callers write int(NetworkSize.LARGE).
        return self.value
|
43 |
+
|
44 |
+
ROOT = os.path.dirname(os.path.abspath(__file__))
|
45 |
+
|
46 |
+
class FaceAlignment:
    """Load a face detector by name and run it on batches of images."""

    def __init__(self, landmarks_type, network_size=NetworkSize.LARGE,
                 device='cuda', flip_input=False, face_detector='sfd', verbose=False):
        """Store the settings and instantiate the requested detector.

        Args:
            landmarks_type: stored on the instance as ``landmarks_type``.
            network_size: must be convertible with ``int()``; not used further here.
            device: device string; cuDNN benchmark mode is enabled when it contains ``'cuda'``.
            flip_input: stored on the instance as ``flip_input``.
            face_detector: name of the sub-module of ``face_detection.detection``
                that provides a ``FaceDetector`` class.
            verbose: forwarded to the detector.
        """
        self.device = device
        self.flip_input = flip_input
        self.landmarks_type = landmarks_type
        self.verbose = verbose

        network_size = int(network_size)

        if 'cuda' in device:
            torch.backends.cudnn.benchmark = True

        # Resolve the detector implementation by module name at runtime.
        detector_module = __import__('face_detection.detection.' + face_detector,
                                     globals(), locals(), [face_detector], 0)
        self.face_detector = detector_module.FaceDetector(device=device, verbose=verbose)

    def get_detections_for_batch(self, images):
        """Return one box per image: ``(x1, y1, x2, y2)`` ints, or ``None`` if nothing was found.

        Args:
            images: array whose last axis is reversed before detection
                (presumably a BGR/RGB channel swap -- TODO confirm with callers).
        """
        reversed_last_axis = images[..., ::-1]
        per_image = self.face_detector.detect_from_batch(reversed_last_axis.copy())

        def first_box(dets):
            # Only the first detection of each image is kept.
            if len(dets) == 0:
                return None
            # Negative values are clamped to 0; the final entry is dropped
            # before the remaining four are truncated to ints.
            clipped = np.clip(dets[0], 0, None)
            x1, y1, x2, y2 = (int(v) for v in clipped[:-1])
            return (x1, y1, x2, y2)

        return [first_box(dets) for dets in per_image]
|
face_detection/detection/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .core import FaceDetector
|
face_detection/detection/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (188 Bytes). View file
|
|
face_detection/detection/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (196 Bytes). View file
|
|
face_detection/detection/__pycache__/core.cpython-36.pyc
ADDED
Binary file (4.86 kB). View file
|
|
face_detection/detection/__pycache__/core.cpython-38.pyc
ADDED
Binary file (4.88 kB). View file
|
|
face_detection/detection/core.py
ADDED
@@ -0,0 +1,130 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import logging
|
2 |
+
import glob
|
3 |
+
from tqdm import tqdm
|
4 |
+
import numpy as np
|
5 |
+
import torch
|
6 |
+
import cv2
|
7 |
+
|
8 |
+
|
9 |
+
class FaceDetector(object):
    """An abstract class representing a face detector.

    Any other face detection implementation must subclass it. All subclasses
    must implement ``detect_from_image``, which returns a list of detected
    bounding boxes. Optionally, for speed considerations, detecting from a
    path is recommended.
    """

    def __init__(self, device, verbose):
        """Store the settings and validate ``device``.

        Arguments:
            device {string} -- must contain ``'cpu'`` or ``'cuda'``.
            verbose {bool} -- when true, warnings and errors are also logged.

        Raises:
            ValueError -- if ``device`` contains neither ``'cpu'`` nor ``'cuda'``.
        """
        self.device = device
        self.verbose = verbose

        # Bind the logger unconditionally. It used to be created only in the
        # verbose+cpu branch, so a verbose call with an unsupported device
        # died with UnboundLocalError instead of the intended ValueError.
        logger = logging.getLogger(__name__)

        if verbose and 'cpu' in device:
            logger.warning("Detection running on CPU, this may be potentially slow.")

        if 'cpu' not in device and 'cuda' not in device:
            if verbose:
                logger.error("Expected values for device are: {cpu, cuda} but got: %s", device)
            raise ValueError("Expected values for device are: {cpu, cuda} but got: %s" % (device,))

    def detect_from_image(self, tensor_or_path):
        """Detects faces in a given image.

        This function detects the faces present in a provided BGR(usually)
        image. The input can be either the image itself or the path to it.

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- the path
            to an image or the image itself.

        Example::

            >>> path_to_image = 'data/image_01.jpg'
            ...   detected_faces = detect_from_image(path_to_image)
            [A list of bounding boxes (x1, y1, x2, y2)]
            >>> image = cv2.imread(path_to_image)
            ...   detected_faces = detect_from_image(image)
            [A list of bounding boxes (x1, y1, x2, y2)]

        """
        raise NotImplementedError

    def detect_from_directory(self, path, extensions=['.jpg', '.png'], recursive=False, show_progress_bar=True):
        """Detects faces from all the images present in a given directory.

        Arguments:
            path {string} -- a string containing a path that points to the folder containing the images

        Keyword Arguments:
            extensions {list} -- list of strings containing the extensions to
            be considered, in the following format: ``.extension_name``
            (default: {['.jpg', '.png']}). The default list is only read,
            never modified.
            recursive {bool} -- whether to scan the folder recursively (default: {False})
            show_progress_bar {bool} -- display a progress bar (default: {True})

        Raises:
            ValueError -- if ``extensions`` is empty.

        Example:
        >>> directory = 'data'
        ...   detected_faces = detect_from_directory(directory)
        {A dictionary of [lists containing bounding boxes(x1, y1, x2, y2)]}

        """
        logger = logging.getLogger(__name__)

        if len(extensions) == 0:
            if self.verbose:
                logger.error("Expected at list one extension, but none was received.")
            raise ValueError("Expected at least one extension, but none was received.")

        if self.verbose:
            logger.info("Constructing the list of images.")
        additional_pattern = '/**/*' if recursive else '/*'
        files = []
        for extension in extensions:
            files.extend(glob.glob(path + additional_pattern + extension, recursive=recursive))

        if self.verbose:
            logger.info("Finished searching for images. %s images found", len(files))
            logger.info("Preparing to run the detection.")

        predictions = {}
        for image_path in tqdm(files, disable=not show_progress_bar):
            if self.verbose:
                logger.info("Running the face detector on image: %s", image_path)
            predictions[image_path] = self.detect_from_image(image_path)

        if self.verbose:
            logger.info("The detector was successfully run on all %s images", len(files))

        return predictions

    @property
    def reference_scale(self):
        raise NotImplementedError

    @property
    def reference_x_shift(self):
        raise NotImplementedError

    @property
    def reference_y_shift(self):
        raise NotImplementedError

    @staticmethod
    def tensor_or_path_to_ndarray(tensor_or_path, rgb=True):
        """Convert path (represented as a string) or torch.tensor to a numpy.ndarray

        Arguments:
            tensor_or_path {numpy.ndarray, torch.tensor or string} -- path to the image, or the image itself
            rgb {bool} -- for a path, the ``cv2.imread`` result has its last
            axis reversed when true; for a tensor or array, the last axis is
            reversed (and the data copied) when false (default: {True})

        Raises:
            TypeError -- if the argument is not a string, tensor or ndarray.
        """
        if isinstance(tensor_or_path, str):
            return cv2.imread(tensor_or_path) if not rgb else cv2.imread(tensor_or_path)[..., ::-1]
        elif torch.is_tensor(tensor_or_path):
            # Call cpu in case its coming from cuda
            return tensor_or_path.cpu().numpy()[..., ::-1].copy() if not rgb else tensor_or_path.cpu().numpy()
        elif isinstance(tensor_or_path, np.ndarray):
            return tensor_or_path[..., ::-1].copy() if not rgb else tensor_or_path
        else:
            raise TypeError
|
face_detection/detection/sfd/__init__.py
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
from .sfd_detector import SFDDetector as FaceDetector
|
face_detection/detection/sfd/__pycache__/__init__.cpython-36.pyc
ADDED
Binary file (213 Bytes). View file
|
|
face_detection/detection/sfd/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (221 Bytes). View file
|
|
face_detection/detection/sfd/__pycache__/bbox.cpython-36.pyc
ADDED
Binary file (4.66 kB). View file
|
|
face_detection/detection/sfd/__pycache__/bbox.cpython-38.pyc
ADDED
Binary file (4.22 kB). View file
|
|
face_detection/detection/sfd/__pycache__/detect.cpython-36.pyc
ADDED
Binary file (3.76 kB). View file
|
|
face_detection/detection/sfd/__pycache__/detect.cpython-38.pyc
ADDED
Binary file (3.74 kB). View file
|
|
face_detection/detection/sfd/__pycache__/net_s3fd.cpython-36.pyc
ADDED
Binary file (3.87 kB). View file
|
|
face_detection/detection/sfd/__pycache__/net_s3fd.cpython-38.pyc
ADDED
Binary file (3.87 kB). View file
|
|
face_detection/detection/sfd/__pycache__/sfd_detector.cpython-36.pyc
ADDED
Binary file (2.95 kB). View file
|
|
face_detection/detection/sfd/__pycache__/sfd_detector.cpython-38.pyc
ADDED
Binary file (2.98 kB). View file
|
|
face_detection/detection/sfd/bbox.py
ADDED
@@ -0,0 +1,129 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from __future__ import print_function
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
import cv2
|
5 |
+
import random
|
6 |
+
import datetime
|
7 |
+
import time
|
8 |
+
import math
|
9 |
+
import argparse
|
10 |
+
import numpy as np
|
11 |
+
import torch
|
12 |
+
|
13 |
+
try:
    # Optional accelerated implementation; the original note reads
    # "IOU cython speedup 10x".
    from iou import IOU
except ImportError:
    # Only a failed import should select the fallback; catching
    # BaseException would also swallow KeyboardInterrupt and SystemExit.
    def IOU(ax1, ay1, ax2, ay2, bx1, by1, bx2, by2):
        """Return the intersection-over-union of boxes ``a`` and ``b``.

        Each box is given as ``x1, y1, x2, y2``. Returns ``0.0`` when the
        boxes do not overlap, and also when the union area is zero (both
        boxes degenerate), which previously raised ZeroDivisionError.
        """
        sa = abs((ax2 - ax1) * (ay2 - ay1))
        sb = abs((bx2 - bx1) * (by2 - by1))
        x1, y1 = max(ax1, bx1), max(ay1, by1)
        x2, y2 = min(ax2, bx2), min(ay2, by2)
        w = x2 - x1
        h = y2 - y1
        if w < 0 or h < 0:
            return 0.0
        union = sa + sb - w * h
        if union == 0:
            return 0.0
        return 1.0 * w * h / union
|
28 |
+
|
29 |
+
|
30 |
+
def bboxlog(x1, y1, x2, y2, axc, ayc, aww, ahh):
    """Express the corner box ``(x1, y1, x2, y2)`` relative to an anchor.

    The anchor is given by its centre ``(axc, ayc)`` and size ``(aww, ahh)``.
    Returns ``(dx, dy, dw, dh)``: the centre offset divided by the anchor
    size, and the natural log of the width and height ratios.
    """
    centre_x = (x2 + x1) / 2
    centre_y = (y2 + y1) / 2
    width = x2 - x1
    height = y2 - y1

    dx = (centre_x - axc) / aww
    dy = (centre_y - ayc) / ahh
    dw = math.log(width / aww)
    dh = math.log(height / ahh)
    return dx, dy, dw, dh
|
35 |
+
|
36 |
+
|
37 |
+
def bboxloginv(dx, dy, dw, dh, axc, ayc, aww, ahh):
    """Turn anchor-relative offsets back into a corner box.

    ``(dx, dy)`` are centre offsets scaled by the anchor size ``(aww, ahh)``
    and ``(dw, dh)`` are log size ratios; ``(axc, ayc)`` is the anchor
    centre. Returns ``(x1, y1, x2, y2)``.
    """
    centre_x = dx * aww + axc
    centre_y = dy * ahh + ayc
    width = math.exp(dw) * aww
    height = math.exp(dh) * ahh

    left = centre_x - width / 2
    right = centre_x + width / 2
    top = centre_y - height / 2
    bottom = centre_y + height / 2
    return left, top, right, bottom
|
42 |
+
|
43 |
+
|
44 |
+
def nms(dets, thresh):
    """Greedy non-maximum suppression.

    Args:
        dets: 2-D array whose columns 0-4 are ``x1, y1, x2, y2, score``.
        thresh: a box is dropped when its overlap with an already kept,
            higher-scoring box is greater than this value.

    Returns:
        List of row indices into ``dets`` that were kept, best score first;
        an empty list when ``dets`` is empty.
    """
    if len(dets) == 0:
        return []

    left, top, right, bottom, score = (dets[:, col] for col in range(5))
    # Extents are treated as inclusive, hence the +1.
    box_area = (right - left + 1) * (bottom - top + 1)
    remaining = score.argsort()[::-1]

    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)

        # Intersection of the best box with every other remaining box.
        inter_w = np.maximum(
            0.0, np.minimum(right[best], right[rest]) - np.maximum(left[best], left[rest]) + 1)
        inter_h = np.maximum(
            0.0, np.minimum(bottom[best], bottom[rest]) - np.maximum(top[best], top[rest]) + 1)
        inter = inter_w * inter_h
        overlap = inter / (box_area[best] + box_area[rest] - inter)

        # Continue with the boxes that do not overlap the best one too much.
        remaining = rest[np.where(overlap <= thresh)[0]]

    return kept
|
65 |
+
|
66 |
+
|
67 |
+
def encode(matched, priors, variances):
    """Encode matched ground-truth boxes as regression targets relative to priors.

    Args:
        matched: (tensor) ground-truth boxes in point form ``(x1, y1, x2, y2)``,
            Shape: [num_priors, 4].
        priors: (tensor) prior boxes in centre-size form ``(cx, cy, w, h)``,
            Shape: [num_priors, 4].
        variances: (list[float]) two scaling factors; index 0 scales the
            centre offsets, index 1 the log size ratios.
    Return:
        encoded boxes (tensor), Shape: [num_priors, 4]
    """
    prior_centres, prior_sizes = priors[:, :2], priors[:, 2:]

    # Offset of the ground-truth centre from the prior centre, scaled by
    # the prior size and the first variance.
    centre_offsets = (matched[:, :2] + matched[:, 2:]) / 2 - prior_centres
    centre_offsets /= (variances[0] * prior_sizes)

    # Log of the ground-truth / prior size ratio, scaled by the second variance.
    size_ratios = (matched[:, 2:] - matched[:, :2]) / prior_sizes
    log_sizes = torch.log(size_ratios) / variances[1]

    return torch.cat([centre_offsets, log_sizes], 1)  # [num_priors,4]
|
89 |
+
|
90 |
+
|
91 |
+
def decode(loc, priors, variances):
    """Turn location predictions back into point-form boxes using the priors.

    Args:
        loc (tensor): location predictions, Shape: [num_priors, 4].
        priors (tensor): prior boxes in centre-size form ``(cx, cy, w, h)``,
            Shape: [num_priors, 4].
        variances: (list[float]) two scaling factors; index 0 scales the
            centre offsets, index 1 the log size ratios.
    Return:
        decoded boxes as ``(x1, y1, x2, y2)``, Shape: [num_priors, 4]
    """
    centres = priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:]
    sizes = priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])

    # Centre-size -> corner form.
    top_left = centres - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 1)
|
110 |
+
|
111 |
+
def batch_decode(loc, priors, variances):
    """Batched variant of box decoding: every tensor carries a leading batch axis.

    Args:
        loc (tensor): location predictions, indexed as [batch, num_priors, 4].
        priors (tensor): prior boxes in centre-size form ``(cx, cy, w, h)``,
            indexed as [batch, num_priors, 4].
        variances: (list[float]) two scaling factors; index 0 scales the
            centre offsets, index 1 the log size ratios.
    Return:
        decoded boxes as ``(x1, y1, x2, y2)`` along the last axis.
    """
    centres = priors[:, :, :2] + loc[:, :, :2] * variances[0] * priors[:, :, 2:]
    sizes = priors[:, :, 2:] * torch.exp(loc[:, :, 2:] * variances[1])

    # Centre-size -> corner form.
    top_left = centres - sizes / 2
    bottom_right = sizes + top_left
    return torch.cat((top_left, bottom_right), 2)
|