import tensorflow as tf import numpy as np from einops import rearrange from decord import VideoReader num_frames = 32 input_size = 224 patch_size = (16, 16) IMAGENET_MEAN = np.array([123.675, 116.28, 103.53]) IMAGENET_STD = np.array([58.395, 57.12, 57.375]) def format_frames(frame, output_size): frame = tf.image.convert_image_dtype(frame, tf.uint8) frame = tf.image.resize(frame, size=output_size) frame = frame - IMAGENET_MEAN frame = frame / IMAGENET_STD return frame def read_video(file_path): container = VideoReader(file_path) return container def frame_sampling(container, num_frames): interval = len(container) // num_frames bids = np.arange(num_frames) * interval offset = np.random.randint(interval, size=bids.shape) frame_index = bids + offset frames = container.get_batch(frame_index).asnumpy() frames = np.stack(frames) frames = format_frames(frames, [input_size] * 2) return frames def denormalize(z): mean = np.array([123.675, 116.28, 103.53]) variance = np.array([np.square(58.395), np.square(57.12), np.square(57.375)]) std = np.sqrt(variance) # no need var and std, todo: update here! x = (z * std) + mean x = x.clip(0, 255) return x