import os
import cv2
import argparse
import glob
import spaces
import torch
import numpy as np
from tqdm import tqdm
from torchvision.transforms.functional import normalize
from basicsr.utils import imwrite, img2tensor, tensor2img
from basicsr.utils.download_util import load_file_from_url
from basicsr.utils.misc import gpu_is_available, get_device
from scipy.ndimage import gaussian_filter1d
from facelib.utils.face_restoration_helper import FaceRestoreHelper
from facelib.utils.misc import is_gray
from basicsr.utils.video_util import VideoReader, VideoWriter
from basicsr.utils.registry import ARCH_REGISTRY
import gradio as gr
from torch.hub import download_url_to_file

title = r"""<h1 align="center">KEEP: Kalman-Inspired Feature Propagation for Video Face Super-Resolution</h1>"""

description = r"""
<b>Official Gradio demo</b> for <a href='https://github.com/jnjaby/KEEP' target='_blank'><b>Kalman-Inspired FEaturE Propagation for Video Face Super-Resolution (ECCV 2024)</b></a>.<br>
🔥 KEEP is a robust video face super-resolution algorithm.<br>
🤗 Try it with your own face video and get the restored result!<br>
"""

post_article = r"""
If you find KEEP helpful, please consider giving the <a href='https://github.com/jnjaby/KEEP' target='_blank'>GitHub Repo</a> a ⭐. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/jnjaby/KEEP)](https://github.com/jnjaby/KEEP)
---
📝 **Citation**
<br>
If our work is useful for your research, please consider citing:
```bibtex
@InProceedings{feng2024keep,
      title     = {Kalman-Inspired FEaturE Propagation for Video Face Super-Resolution},
      author    = {Feng, Ruicheng and Li, Chongyi and Loy, Chen Change},
      booktitle = {European Conference on Computer Vision (ECCV)},
      year      = {2024}
}
```

📋 **License**
<br>
This project is licensed under <a rel="license" href="https://github.com/jnjaby/KEEP/blob/main/LICENSE">S-Lab License 1.0</a>. 
Redistribution and use for non-commercial purposes should follow this license.
<br><br>
📧 **Contact**
<br>
If you have any questions, please feel free to reach out via <b>[email protected]</b>.
"""



def interpolate_sequence(sequence):
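    """Fill NaN entries in a 1-D sequence by linear interpolation between the valid entries."""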
    interpolated_sequence = np.copy(sequence)
    missing_indices = np.isnan(sequence)
    if np.any(missing_indices):
        valid_indices = ~missing_indices
        x = np.arange(len(sequence))
        interpolated_sequence[missing_indices] = np.interp(x[missing_indices], x[valid_indices], sequence[valid_indices])
    return interpolated_sequence

def set_realesrgan():
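    """Build a RealESRGAN x2 upsampler for background enhancement; half precision is skipped on GTX 1650/1660 GPUs."""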
    from basicsr.archs.rrdbnet_arch import RRDBNet
    from basicsr.utils.realesrgan_utils import RealESRGANer
    use_half = False
    if torch.cuda.is_available():
        no_half_gpu_list = ['1650', '1660']
        if not any(gpu in torch.cuda.get_device_name(0) for gpu in no_half_gpu_list):
            use_half = True
    model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=2)
    upsampler = RealESRGANer(scale=2, model_path="https://github.com/jnjaby/KEEP/releases/download/v1.0.0/RealESRGAN_x2plus.pth", model=model, tile=400, tile_pad=40, pre_pad=0, half=use_half)
    if not gpu_is_available():
        import warnings
        warnings.warn('Running on CPU now! Make sure your PyTorch version matches your CUDA. The unoptimized RealESRGAN is slow on CPU.', category=RuntimeWarning)
    return upsampler

@spaces.GPU(duration=300)
def process_video(input_video, draw_box, bg_enhancement):
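    """Restore faces in `input_video` with KEEP and return the path of the saved MP4."""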
    device = get_device()
    args = argparse.Namespace(
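        # Fixed inference settings; the Gradio UI only exposes the input video, draw_box, and bg_enhancement.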
        input_path=input_video,
        upscale=1,
        max_length=20,
        has_aligned=False,
        only_center_face=True,
        draw_box=draw_box,
        detection_model='retinaface_resnet50',
        bg_enhancement=bg_enhancement,
        face_upsample=False,
        bg_tile=400,
        suffix=None,
        save_video_fps=None,
        model_type='KEEP',
        progress=gr.Progress(track_tqdm=True)
    )

    output_dir = './results/'
    os.makedirs(output_dir, exist_ok=True)

    model_configs = {
        'KEEP': {
            'architecture': {
                'img_size': 512, 'emb_dim': 256, 'dim_embd': 512, 'n_head': 8, 'n_layers': 9,
                'codebook_size': 1024, 'cft_list': ['16', '32', '64'], 'kalman_attn_head_dim': 48,
                'num_uncertainty_layers': 3, 'cfa_list': ['16', '32'], 'cfa_nhead': 4, 'cfa_dim': 256, 'cond': 1
            },
            'checkpoint_dir': '/home/user/app/weights/KEEP',
            'checkpoint_url': 'https://github.com/jnjaby/KEEP/releases/download/v1.0.0/KEEP-b76feb75.pth'
        },
    }
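    # Optional RealESRGAN background upsampler; face upsampling stays disabled in this demo.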
    if args.bg_enhancement:
        bg_upsampler = set_realesrgan()
    else:
        bg_upsampler = None
    if args.face_upsample:
        face_upsampler = bg_upsampler if bg_upsampler is not None else set_realesrgan()
    else:
        face_upsampler = None

    if args.model_type not in model_configs:
        raise ValueError(f"Unknown model type: {args.model_type}. Available options: {list(model_configs.keys())}")
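    # Build the KEEP network from the registry and load the released EMA weights.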
    config = model_configs[args.model_type]
    net = ARCH_REGISTRY.get('KEEP')(**config['architecture']).to(device)
    ckpt_path = load_file_from_url(url=config['checkpoint_url'], model_dir=config['checkpoint_dir'], progress=True, file_name=None)
    checkpoint = torch.load(ckpt_path, weights_only=True)
    net.load_state_dict(checkpoint['params_ema'])
    net.eval()
    if not args.has_aligned:
        print(f'Face detection model: {args.detection_model}')
    if bg_upsampler is not None:
        print(f'Background upsampling: True, Face upsampling: {args.face_upsample}')
    else:
        print(f'Background upsampling: False, Face upsampling: {args.face_upsample}')
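    # Helper that detects faces, aligns them to 512x512 crops, and later pastes the restored crops back.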
    face_helper = FaceRestoreHelper(args.upscale, face_size=512, crop_ratio=(1, 1), det_model=args.detection_model, save_ext='png', use_parse=True, device=device)

    # Reading the input video.
    input_img_list = []
    if args.input_path.endswith(('mp4', 'mov', 'avi', 'MP4', 'MOV', 'AVI')):
        vidreader = VideoReader(args.input_path)
        image = vidreader.get_frame()
        while image is not None:
            input_img_list.append(image)
            image = vidreader.get_frame()
        fps = vidreader.get_fps() if args.save_video_fps is None else args.save_video_fps
        vidreader.close()
        clip_name = os.path.basename(args.input_path)[:-4]
    else:
        raise TypeError(f'Unrecognized type of input video {args.input_path}.')
    if len(input_img_list) == 0:
        raise FileNotFoundError('No input image/video is found...')

    print('Detecting keypoints and smooth alignment ...')
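    # Detect 5-point landmarks per frame; frames with no detection become NaNs that are
    # interpolated from neighbouring frames, then the trajectories are Gaussian-smoothed
    # to reduce jitter in the aligned crops.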
    if not args.has_aligned:
        raw_landmarks = []
        for i, img in enumerate(input_img_list):
            face_helper.clean_all()
            face_helper.read_image(img)
            num_det_faces = face_helper.get_face_landmarks_5(only_center_face=args.only_center_face, resize=640, eye_dist_threshold=5, only_keep_largest=True)
            if num_det_faces == 1:
                raw_landmarks.append(face_helper.all_landmarks_5[0].reshape((10,)))
            elif num_det_faces == 0:
                raw_landmarks.append(np.array([np.nan]*10))
        raw_landmarks = np.array(raw_landmarks)
        for i in range(10):
            raw_landmarks[:, i] = interpolate_sequence(raw_landmarks[:, i])
        video_length = len(input_img_list)
        avg_landmarks = gaussian_filter1d(raw_landmarks, 5, axis=0).reshape(video_length, 5, 2)
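    # Warp-align every frame with the smoothed landmarks and normalize the crops to [-1, 1].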
    cropped_faces = []
    for i, img in enumerate(input_img_list):
        face_helper.clean_all()
        face_helper.read_image(img)
        face_helper.all_landmarks_5 = [avg_landmarks[i]]
        face_helper.align_warp_face()
        cropped_face_t = img2tensor(face_helper.cropped_faces[0] / 255., bgr2rgb=True, float32=True)
        normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True)
        cropped_faces.append(cropped_face_t)
    cropped_faces = torch.stack(cropped_faces, dim=0).unsqueeze(0).to(device)
    print('Restoring faces ...')
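    # Restore the clip in chunks of at most `max_length` frames to bound GPU memory; a leftover
    # single frame is duplicated so the network always receives at least two frames.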
    with torch.no_grad():
        video_length = cropped_faces.shape[1]
        output = []
        for start_idx in range(0, video_length, args.max_length):
            end_idx = min(start_idx + args.max_length, video_length)
            if end_idx - start_idx == 1:
                output.append(net(cropped_faces[:, [start_idx, start_idx], ...], need_upscale=False)[:, 0:1, ...])
            else:
                output.append(net(cropped_faces[:, start_idx:end_idx, ...], need_upscale=False))
        output = torch.cat(output, dim=1).squeeze(0)
        assert output.shape[0] == video_length, "Different number of frames"
        restored_faces = [tensor2img(x, rgb2bgr=True, min_max=(-1, 1)) for x in output]
        del output
        torch.cuda.empty_cache()
    print('Pasting faces back ...')
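    # Paste each restored face back into its original frame, optionally over a RealESRGAN-upsampled background.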

    restored_frames = []
    for i, img in enumerate(input_img_list):
        face_helper.clean_all()
        if args.has_aligned:
            img = cv2.resize(img, (512, 512), interpolation=cv2.INTER_LINEAR)
            face_helper.is_gray = is_gray(img, threshold=10)
            if face_helper.is_gray:
                print('Grayscale input: True')
            face_helper.cropped_faces = [img]
        else:
            face_helper.read_image(img)
            face_helper.all_landmarks_5 = [avg_landmarks[i]]
            face_helper.align_warp_face()
        face_helper.add_restored_face(restored_faces[i].astype('uint8'))
        if not args.has_aligned:
            if bg_upsampler is not None:
                bg_img = bg_upsampler.enhance(img, outscale=args.upscale)[0]
            else:
                bg_img = None
            face_helper.get_inverse_affine(None)
            if args.face_upsample and face_upsampler is not None:
                restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box, face_upsampler=face_upsampler)
            else:
                restored_img = face_helper.paste_faces_to_input_image(upsample_img=bg_img, draw_box=args.draw_box)
        else:
            # Pre-aligned inputs are already 512x512 face crops; use the restored face directly as the output frame.
            restored_img = restored_faces[i].astype('uint8')

        restored_frames.append(restored_img)

    # Saving the output video.
    print('Saving video ...')
    height, width = restored_frames[0].shape[:2]
    save_restore_path = os.path.join(output_dir, f'{clip_name}.mp4')
    vidwriter = VideoWriter(save_restore_path, height, width, fps)
    for f in restored_frames:
        vidwriter.write_frame(f)
    vidwriter.close()
    print(f'All results are saved in {save_restore_path}.')
    return save_restore_path

# Downloading necessary models and sample videos.
sample_videos_dir = os.path.join("/home/user/app/hugging_face/", "test_sample/")
os.makedirs(sample_videos_dir, exist_ok=True)
download_url_to_file("https://github.com/jnjaby/KEEP/releases/download/media/real_1.mp4", os.path.join(sample_videos_dir, "real_1.mp4"))
download_url_to_file("https://github.com/jnjaby/KEEP/releases/download/media/real_2.mp4", os.path.join(sample_videos_dir, "real_2.mp4"))
download_url_to_file("https://github.com/jnjaby/KEEP/releases/download/media/real_3.mp4", os.path.join(sample_videos_dir, "real_3.mp4"))
download_url_to_file("https://github.com/jnjaby/KEEP/releases/download/media/real_4.mp4", os.path.join(sample_videos_dir, "real_4.mp4"))


model_dir = "/home/user/app/weights/"
model_url = "https://github.com/jnjaby/KEEP/releases/download/v1.0.0/"
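# Pre-download all checkpoints at start-up so they are cached before the first GPU request.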

_ = load_file_from_url(url=os.path.join(model_url, 'KEEP-b76feb75.pth'),
                        model_dir=os.path.join(model_dir, "KEEP"), progress=True, file_name=None)

_ = load_file_from_url(url=os.path.join(model_url, 'detection_Resnet50_Final.pth'),
                        model_dir=os.path.join(model_dir, "facelib"), progress=True, file_name=None)
_ = load_file_from_url(url=os.path.join(model_url, 'detection_mobilenet0.25_Final.pth'),
                        model_dir=os.path.join(model_dir, "facelib"), progress=True, file_name=None)
_ = load_file_from_url(url=os.path.join(model_url, 'yolov5n-face.pth'),
                        model_dir=os.path.join(model_dir, "facelib"), progress=True, file_name=None)
_ = load_file_from_url(url=os.path.join(model_url, 'yolov5l-face.pth'),
                        model_dir=os.path.join(model_dir, "facelib"), progress=True, file_name=None)
_ = load_file_from_url(url=os.path.join(model_url, 'parsing_parsenet.pth'),
                        model_dir=os.path.join(model_dir, "facelib"), progress=True, file_name=None)

_ = load_file_from_url(url=os.path.join(model_url, 'RealESRGAN_x2plus.pth'),
                        model_dir=os.path.join(model_dir, "realesrgan"), progress=True, file_name=None)



# Launching the Gradio interface.
demo = gr.Interface(
    fn=process_video,
    title=title,
    description=description,
    inputs=[
        gr.Video(label="Input Video"),
        gr.Checkbox(label="Draw Box", value=False),
        gr.Checkbox(label="Background Enhancement", value=False),
    ],
    outputs=gr.Video(label="Processed Video"),
    examples=[
        [os.path.join(sample_videos_dir, "real_1.mp4"), True, False],
        [os.path.join(sample_videos_dir, "real_2.mp4"), True, False],
        [os.path.join(sample_videos_dir, "real_3.mp4"), True, False],
        [os.path.join(sample_videos_dir, "real_4.mp4"), True, False],
    ],
    cache_examples=False,
    article=post_article
)


demo.launch(share=True)
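# Note: share=True requests a temporary public gradio.live link when run locally;
# on Hugging Face Spaces the app is already served at the Space URL.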