|
import tensorflow as tf |
|
import numpy as np |
|
from einops import rearrange |
|
from decord import VideoReader |
|
|
|
# Clip geometry. Only ``input_size`` is read by the functions visible in this
# section (frame_sampling resizes frames to input_size x input_size);
# ``num_frames`` and ``patch_size`` are presumably consumed by callers or by
# model code elsewhere -- TODO confirm.
num_frames = 32
input_size = 224
patch_size = (16, 16)

# Per-channel normalization statistics expressed on the 0-255 pixel scale
# (format_frames subtracts/divides by them after converting frames to uint8).
# Channel order is presumably RGB -- verify against the decoder's output.
IMAGENET_MEAN = np.array([123.675, 116.28, 103.53], dtype=np.float64)
IMAGENET_STD = np.array([58.395, 57.12, 57.375], dtype=np.float64)
|
|
|
def format_frames(frame, output_size):
    """Resize frames and normalize them with the ImageNet statistics.

    Args:
        frame: Image data accepted by ``tf.image.convert_image_dtype``;
            frame_sampling passes a whole stack of decoded frames at once.
        output_size: Target ``[height, width]`` handed to ``tf.image.resize``.

    Returns:
        The resized frames with ``IMAGENET_MEAN`` subtracted and the result
        divided by ``IMAGENET_STD``.
    """
    # Bring the input onto the uint8 scale first so the 0-255 based
    # statistics below apply to it.
    as_uint8 = tf.image.convert_image_dtype(frame, tf.uint8)
    resized = tf.image.resize(as_uint8, size=output_size)
    return (resized - IMAGENET_MEAN) / IMAGENET_STD
|
|
|
def read_video(file_path):
    """Open the video at ``file_path`` and return its decord ``VideoReader``."""
    return VideoReader(file_path)
|
|
|
def frame_sampling(container, num_frames):
    """Sample ``num_frames`` frames from a video and preprocess them.

    The video is cut into ``num_frames`` consecutive segments of
    ``len(container) // num_frames`` frames each, and one frame is drawn
    uniformly at random from every segment. Frames past the last full
    segment are never selected.

    When the video holds fewer frames than requested the segment length
    would be zero, so evenly spaced indices (with repeats) are used instead
    of random offsets; the result still contains ``num_frames`` frames.

    Args:
        container: Opened video supporting ``len()`` and
            ``get_batch(indices).asnumpy()`` (e.g. what ``read_video``
            returns).
        num_frames: Number of frames to sample.

    Returns:
        The sampled frames after ``format_frames`` resized them to
        ``input_size`` x ``input_size`` and normalized them.

    Raises:
        ValueError: If the video contains no frames.
    """
    total_frames = len(container)
    if total_frames == 0:
        raise ValueError("cannot sample frames from an empty video")

    interval = total_frames // num_frames
    if interval > 0:
        # Start index of each segment plus a random offset inside it.
        segment_starts = np.arange(num_frames) * interval
        offset = np.random.randint(interval, size=segment_starts.shape)
        frame_index = segment_starts + offset
    else:
        # Fewer frames than requested: np.random.randint(0) would raise, so
        # spread the indices over the whole video, repeating frames as needed.
        frame_index = (
            np.linspace(0, total_frames - 1, num_frames).round().astype(np.int64)
        )

    frames = container.get_batch(frame_index).asnumpy()
    return format_frames(frames, [input_size] * 2)
|
|
|
def denormalize(z, mean=None, std=None):
    """Undo per-channel normalization and clamp the result to [0, 255].

    Computes ``z * std + mean`` and clips the outcome to the valid pixel
    range, i.e. the inverse of the normalization applied in
    ``format_frames``.

    Args:
        z: Normalized values; must broadcast against ``std`` and ``mean``.
        mean: Optional per-channel mean to add back. Defaults to the
            module-level ``IMAGENET_MEAN``.
        std: Optional per-channel standard deviation to scale by. Defaults
            to the module-level ``IMAGENET_STD``.

    Returns:
        The de-normalized values, clipped to the range [0, 255].
    """
    # Fall back to the shared module constants instead of repeating the
    # literals here, so normalization and its inverse cannot drift apart.
    if mean is None:
        mean = IMAGENET_MEAN
    if std is None:
        std = IMAGENET_STD

    restored = (z * np.asarray(std)) + np.asarray(mean)
    return restored.clip(0, 255)