# Copyright (c) Facebook, Inc. and its affiliates. All Rights Reserved.
import math
import random

import numpy as np
import torch
from PIL import Image
# pytorch=1.7.1
from torchvision.transforms import Compose, Resize, CenterCrop, ToTensor, Normalize
# pip install opencv-python
import cv2
try:
    import ffmpeg
except ImportError:
    pass
import av
def random_short_side_scale_jitter(
images, min_size, max_size, boxes=None, inverse_uniform_sampling=False
):
"""
Perform a spatial short scale jittering on the given images and
corresponding boxes.
Args:
images (tensor): images to perform scale jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
min_size (int): the minimal size to scale the frames.
max_size (int): the maximal size to scale the frames.
boxes (ndarray): optional. Corresponding boxes to images.
Dimension is `num boxes` x 4.
        inverse_uniform_sampling (bool): if True, sample uniformly in
            [1 / max_size, 1 / min_size] and take the reciprocal to get the
            size. If False, take a uniform sample from [min_size, max_size].
Returns:
(tensor): the scaled images with dimension of
`num frames` x `channel` x `new height` x `new width`.
(ndarray or None): the scaled boxes with dimension of
`num boxes` x 4.
"""
if inverse_uniform_sampling:
size = int(
round(1.0 / np.random.uniform(1.0 / max_size, 1.0 / min_size))
)
else:
size = int(round(np.random.uniform(min_size, max_size)))
height = images.shape[2]
width = images.shape[3]
if (width <= height and width == size) or (
height <= width and height == size
):
return images, boxes
new_width = size
new_height = size
if width < height:
new_height = int(math.floor((float(height) / width) * size))
if boxes is not None:
boxes = boxes * float(new_height) / height
else:
new_width = int(math.floor((float(width) / height) * size))
if boxes is not None:
boxes = boxes * float(new_width) / width
return (
torch.nn.functional.interpolate(
images,
size=(new_height, new_width),
mode="bilinear",
align_corners=False,
),
boxes,
)
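# Illustrative usage (not part of the original module): jitter a clip of 8
# RGB frames so that its short side lands somewhere in [256, 320] pixels.
#
#   frames = torch.rand(8, 3, 240, 426)
#   frames, _ = random_short_side_scale_jitter(frames, min_size=256, max_size=320)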
def crop_boxes(boxes, x_offset, y_offset):
"""
    Perform crop on the bounding boxes given the offsets.
    Args:
        boxes (ndarray or None): bounding boxes to perform crop. The dimension
is `num boxes` x 4.
x_offset (int): cropping offset in the x axis.
y_offset (int): cropping offset in the y axis.
Returns:
cropped_boxes (ndarray or None): the cropped boxes with dimension of
`num boxes` x 4.
"""
cropped_boxes = boxes.copy()
cropped_boxes[:, [0, 2]] = boxes[:, [0, 2]] - x_offset
cropped_boxes[:, [1, 3]] = boxes[:, [1, 3]] - y_offset
return cropped_boxes
def random_crop(images, size, boxes=None):
"""
Perform random spatial crop on the given images and corresponding boxes.
Args:
images (tensor): images to perform random crop. The dimension is
`num frames` x `channel` x `height` x `width`.
size (int): the size of height and width to crop on the image.
boxes (ndarray or None): optional. Corresponding boxes to images.
Dimension is `num boxes` x 4.
Returns:
cropped (tensor): cropped images with dimension of
`num frames` x `channel` x `size` x `size`.
cropped_boxes (ndarray or None): the cropped boxes with dimension of
`num boxes` x 4.
"""
    if images.shape[2] == size and images.shape[3] == size:
        return images, boxes
height = images.shape[2]
width = images.shape[3]
y_offset = 0
if height > size:
y_offset = int(np.random.randint(0, height - size))
x_offset = 0
if width > size:
x_offset = int(np.random.randint(0, width - size))
cropped = images[
:, :, y_offset : y_offset + size, x_offset : x_offset + size
]
cropped_boxes = (
crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
)
return cropped, cropped_boxes
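# Illustrative usage (not part of the original module): randomly crop a
# 224x224 window from each frame, shifting any boxes accordingly.
#
#   frames = torch.rand(8, 3, 256, 320)
#   boxes = np.array([[10.0, 20.0, 200.0, 180.0]])
#   cropped, cropped_boxes = random_crop(frames, 224, boxes=boxes)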
def horizontal_flip(prob, images, boxes=None):
"""
Perform horizontal flip on the given images and corresponding boxes.
Args:
        prob (float): probability to flip the images.
images (tensor): images to perform horizontal flip, the dimension is
`num frames` x `channel` x `height` x `width`.
boxes (ndarray or None): optional. Corresponding boxes to images.
Dimension is `num boxes` x 4.
Returns:
images (tensor): images with dimension of
`num frames` x `channel` x `height` x `width`.
flipped_boxes (ndarray or None): the flipped boxes with dimension of
`num boxes` x 4.
"""
if boxes is None:
flipped_boxes = None
else:
flipped_boxes = boxes.copy()
if np.random.uniform() < prob:
images = images.flip((-1))
width = images.shape[3]
if boxes is not None:
flipped_boxes[:, [0, 2]] = width - boxes[:, [2, 0]] - 1
return images, flipped_boxes
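# Illustrative usage (not part of the original module): flip each clip with
# probability 0.5, mirroring any boxes.
#
#   frames, boxes = horizontal_flip(0.5, frames, boxes=boxes)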
def uniform_crop(images, size, spatial_idx, boxes=None):
"""
Perform uniform spatial sampling on the images and corresponding boxes.
Args:
images (tensor): images to perform uniform crop. The dimension is
`num frames` x `channel` x `height` x `width`.
        size (int): size of height and width to crop the images.
spatial_idx (int): 0, 1, or 2 for left, center, and right crop if width
is larger than height. Or 0, 1, or 2 for top, center, and bottom
crop if height is larger than width.
boxes (ndarray or None): optional. Corresponding boxes to images.
Dimension is `num boxes` x 4.
Returns:
cropped (tensor): images with dimension of
`num frames` x `channel` x `size` x `size`.
cropped_boxes (ndarray or None): the cropped boxes with dimension of
`num boxes` x 4.
"""
assert spatial_idx in [0, 1, 2]
height = images.shape[2]
width = images.shape[3]
y_offset = int(math.ceil((height - size) / 2))
x_offset = int(math.ceil((width - size) / 2))
if height > width:
if spatial_idx == 0:
y_offset = 0
elif spatial_idx == 2:
y_offset = height - size
else:
if spatial_idx == 0:
x_offset = 0
elif spatial_idx == 2:
x_offset = width - size
cropped = images[
:, :, y_offset : y_offset + size, x_offset : x_offset + size
]
cropped_boxes = (
crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
)
return cropped, cropped_boxes
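# Illustrative usage (not part of the original module): the standard 3-crop
# test-time protocol takes the left/top, center, and right/bottom crops.
#
#   crops = [uniform_crop(frames, 224, idx)[0] for idx in (0, 1, 2)]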
def uniform_crop_2crops(images, size, spatial_idx, boxes=None):
"""
    Perform two-crop uniform spatial sampling on the images and corresponding
    boxes.
Args:
images (tensor): images to perform uniform crop. The dimension is
`num frames` x `channel` x `height` x `width`.
        size (int): size of height and width to crop the images.
        spatial_idx (int): 0 or 1, selecting which of the two crops to take
            along the longer side (top/bottom crop if height is larger than
            width, left/right crop otherwise).
boxes (ndarray or None): optional. Corresponding boxes to images.
Dimension is `num boxes` x 4.
Returns:
cropped (tensor): images with dimension of
`num frames` x `channel` x `size` x `size`.
cropped_boxes (ndarray or None): the cropped boxes with dimension of
`num boxes` x 4.
"""
    assert spatial_idx in [0, 1]
height = images.shape[2]
width = images.shape[3]
if height > width:
x_offset = 0
if height > size * 2:
if spatial_idx == 0:
y_offset = int((height - size * 2) // 2)
elif spatial_idx == 1:
y_offset = int(height - size - ((height - size * 2) // 2))
else:
if spatial_idx == 0:
y_offset = 0
elif spatial_idx == 1:
y_offset = height - size
else:
y_offset = 0
if width > size * 2:
if spatial_idx == 0:
x_offset = int((width - size * 2) // 2)
elif spatial_idx == 1:
x_offset = int(width - size - ((width - size * 2) // 2))
else:
if spatial_idx == 0:
x_offset = 0
elif spatial_idx == 1:
x_offset = width - size
cropped = images[
:, :, y_offset : y_offset + size, x_offset : x_offset + size
]
cropped_boxes = (
crop_boxes(boxes, x_offset, y_offset) if boxes is not None else None
)
return cropped, cropped_boxes
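# Illustrative usage (not part of the original module): take the two crops
# along the longer side of the frames.
#
#   crops = [uniform_crop_2crops(frames, 224, idx)[0] for idx in (0, 1)]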
def clip_boxes_to_image(boxes, height, width):
"""
Clip an array of boxes to an image with the given height and width.
Args:
boxes (ndarray): bounding boxes to perform clipping.
Dimension is `num boxes` x 4.
height (int): given image height.
width (int): given image width.
Returns:
clipped_boxes (ndarray): the clipped boxes with dimension of
`num boxes` x 4.
"""
clipped_boxes = boxes.copy()
clipped_boxes[:, [0, 2]] = np.minimum(
width - 1.0, np.maximum(0.0, boxes[:, [0, 2]])
)
clipped_boxes[:, [1, 3]] = np.minimum(
height - 1.0, np.maximum(0.0, boxes[:, [1, 3]])
)
return clipped_boxes
def blend(images1, images2, alpha):
"""
Blend two images with a given weight alpha.
Args:
images1 (tensor): the first images to be blended, the dimension is
`num frames` x `channel` x `height` x `width`.
images2 (tensor): the second images to be blended, the dimension is
`num frames` x `channel` x `height` x `width`.
alpha (float): the blending weight.
Returns:
(tensor): blended images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
return images1 * alpha + images2 * (1 - alpha)
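# Illustrative usage (not part of the original module): the brightness,
# contrast, and saturation jitters below are all built from this blend, mixing
# the input with a reference image (black, mean gray, or per-pixel grayscale).
#
#   darker = blend(frames, torch.zeros_like(frames), alpha=0.7)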
def grayscale(images):
"""
Get the grayscale for the input images. The channels of images should be
in order BGR.
Args:
images (tensor): the input images for getting grayscale. Dimension is
`num frames` x `channel` x `height` x `width`.
Returns:
        img_gray (tensor): grayscale images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
# R -> 0.299, G -> 0.587, B -> 0.114.
    img_gray = images.clone()
gray_channel = (
0.299 * images[:, 2] + 0.587 * images[:, 1] + 0.114 * images[:, 0]
)
img_gray[:, 0] = gray_channel
img_gray[:, 1] = gray_channel
img_gray[:, 2] = gray_channel
return img_gray
def color_jitter(images, img_brightness=0, img_contrast=0, img_saturation=0):
"""
    Perform color jittering on the input images. The channels of images
should be in order BGR.
Args:
images (tensor): images to perform color jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
img_brightness (float): jitter ratio for brightness.
img_contrast (float): jitter ratio for contrast.
img_saturation (float): jitter ratio for saturation.
Returns:
images (tensor): the jittered images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
jitter = []
if img_brightness != 0:
jitter.append("brightness")
if img_contrast != 0:
jitter.append("contrast")
if img_saturation != 0:
jitter.append("saturation")
if len(jitter) > 0:
order = np.random.permutation(np.arange(len(jitter)))
for idx in range(0, len(jitter)):
if jitter[order[idx]] == "brightness":
images = brightness_jitter(img_brightness, images)
elif jitter[order[idx]] == "contrast":
images = contrast_jitter(img_contrast, images)
elif jitter[order[idx]] == "saturation":
images = saturation_jitter(img_saturation, images)
return images
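# Illustrative usage (not part of the original module): apply all three
# jitters, each with strength 0.4, in a random order.
#
#   frames = color_jitter(frames, img_brightness=0.4, img_contrast=0.4, img_saturation=0.4)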
def brightness_jitter(var, images):
"""
    Perform brightness jittering on the input images. The channels of images
should be in order BGR.
Args:
var (float): jitter ratio for brightness.
images (tensor): images to perform color jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
Returns:
images (tensor): the jittered images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
alpha = 1.0 + np.random.uniform(-var, var)
    img_bright = torch.zeros_like(images)
images = blend(images, img_bright, alpha)
return images
def contrast_jitter(var, images):
"""
    Perform contrast jittering on the input images. The channels of images
should be in order BGR.
Args:
var (float): jitter ratio for contrast.
images (tensor): images to perform color jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
Returns:
images (tensor): the jittered images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
alpha = 1.0 + np.random.uniform(-var, var)
img_gray = grayscale(images)
img_gray[:] = torch.mean(img_gray, dim=(1, 2, 3), keepdim=True)
images = blend(images, img_gray, alpha)
return images
def saturation_jitter(var, images):
"""
    Perform saturation jittering on the input images. The channels of images
should be in order BGR.
Args:
var (float): jitter ratio for saturation.
images (tensor): images to perform color jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
Returns:
images (tensor): the jittered images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
alpha = 1.0 + np.random.uniform(-var, var)
img_gray = grayscale(images)
images = blend(images, img_gray, alpha)
return images
def lighting_jitter(images, alphastd, eigval, eigvec):
"""
Perform AlexNet-style PCA jitter on the given images.
Args:
images (tensor): images to perform lighting jitter. Dimension is
`num frames` x `channel` x `height` x `width`.
alphastd (float): jitter ratio for PCA jitter.
eigval (list): eigenvalues for PCA jitter.
eigvec (list[list]): eigenvectors for PCA jitter.
Returns:
out_images (tensor): the jittered images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
if alphastd == 0:
return images
# generate alpha1, alpha2, alpha3.
alpha = np.random.normal(0, alphastd, size=(1, 3))
eig_vec = np.array(eigvec)
eig_val = np.reshape(eigval, (1, 3))
rgb = np.sum(
eig_vec * np.repeat(alpha, 3, axis=0) * np.repeat(eig_val, 3, axis=0),
axis=1,
)
out_images = torch.zeros_like(images)
for idx in range(images.shape[1]):
out_images[:, idx] = images[:, idx] + rgb[2 - idx]
return out_images
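# Illustrative usage (not part of the original module): `eigval`/`eigvec` are
# the per-dataset PCA statistics of the pixel colors (e.g. the usual ImageNet
# values); `cfg_eigval`/`cfg_eigvec` below are hypothetical names standing in
# for whatever the training config provides.
#
#   frames = lighting_jitter(frames, alphastd=0.1, eigval=cfg_eigval, eigvec=cfg_eigvec)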
def color_normalization(images, mean, stddev):
"""
    Perform color normalization on the given images.
Args:
images (tensor): images to perform color normalization. Dimension is
`num frames` x `channel` x `height` x `width`.
mean (list): mean values for normalization.
stddev (list): standard deviations for normalization.
Returns:
        out_images (tensor): the normalized images, the dimension is
`num frames` x `channel` x `height` x `width`.
"""
assert len(mean) == images.shape[1], "channel mean not computed properly"
assert (
len(stddev) == images.shape[1]
), "channel stddev not computed properly"
out_images = torch.zeros_like(images)
for idx in range(len(mean)):
out_images[:, idx] = (images[:, idx] - mean[idx]) / stddev[idx]
return out_images
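# Illustrative usage (not part of the original module): normalize frames that
# are already scaled to [0, 1], using generic per-channel statistics.
#
#   frames = color_normalization(frames, mean=[0.45, 0.45, 0.45], stddev=[0.225, 0.225, 0.225])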
class RawVideoExtractorCV2(object):
    def __init__(self, centercrop=False, size=224, framerate=-1):
self.centercrop = centercrop
self.size = size
self.framerate = framerate
self.transform = self._transform(self.size)
        # CLIP normalization statistics, scaled to the [0, 255] pixel range
        # for the raw-frame (ffmpeg) decoding path below.
self.mean = np.array((0.48145466, 0.4578275, 0.40821073)).reshape(1, 1, 1, 3) * 255
self.std = np.array((0.26862954, 0.26130258, 0.27577711)).reshape(1, 1, 1, 3) * 255
def _transform(self, n_px):
return Compose([
Resize(n_px, interpolation=Image.BICUBIC),
CenterCrop(n_px),
lambda image: image.convert("RGB"),
ToTensor(),
Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)),
])
def video_to_tensor(self, video_file, preprocess, sample_fp=0, num_frames=50, sample_offset=0, start_time=None, end_time=None, impl="pyav"):
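        # Decode `video_file` into a tensor of preprocessed frames.
        # sample_fp > 0 samples roughly `sample_fp` frames per second,
        # sample_fp == 0 keeps every frame, and sample_fp == -1 (the default
        # framerate) instead samples `num_frames` frames spread over the video.
        # `sample_offset` fixes the offset inside each sampling interval;
        # otherwise a random offset is drawn. `impl` selects the decoding
        # backend: "cv2", "ffmpeg", or "pyav".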
if start_time is not None or end_time is not None:
assert isinstance(start_time, int) and isinstance(end_time, int) \
and start_time > -1 and end_time > start_time
# assert sample_fp > -1
if impl == "cv2":
            # Sample frames at roughly sample_fp frames per second.
cap = cv2.VideoCapture(video_file)
frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
total_duration = (frameCount + fps - 1) // fps
start_sec, end_sec = 0, total_duration
if start_time is not None:
start_sec, end_sec = start_time, end_time if end_time <= total_duration else total_duration
cap.set(cv2.CAP_PROP_POS_FRAMES, int(start_time * fps))
ret = True
images, included = [], []
if sample_fp > -1:
if impl == "cv2":
# sample by fixed interval
interval = 1
if sample_fp > 0:
interval = fps // sample_fp
else:
sample_fp = fps
if interval == 0: interval = 1
inds = [ind for ind in np.arange(0, fps, interval)]
assert len(inds) >= sample_fp
inds = inds[:sample_fp]
offset = min(sample_offset, interval - 1) if sample_offset > 0 else random.randint(0, interval - 1)
for sec in np.arange(start_sec, end_sec + 1):
if not ret: break
# sec_base = int(sec * fps)
sec_base = int(sec * fps + offset)
for ind in inds:
cap.set(cv2.CAP_PROP_POS_FRAMES, sec_base + ind)
ret, frame = cap.read()
if not ret: break
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
images.append(preprocess(Image.fromarray(frame_rgb).convert("RGB")))
if len(images) > 0:
video_data = torch.tensor(np.stack(images))
else:
video_data = torch.zeros(1)
cap.release()
elif impl == "ffmpeg":
if sample_fp == 0:
sample_fp = 1000 # sample every frame
out, _ = (
ffmpeg
.input(video_file)
.filter('select', 'isnan(prev_selected_t)+gte(t-prev_selected_t,{})'.format(1 / sample_fp))
.filter('crop', 'min(in_w, in_h)', 'min(in_w, in_h)', '(in_w - min(in_w, in_h)) / 2', '(in_h - min(in_w, in_h)) / 2') # w, h, x, y, center crop
.filter('scale', self.size, self.size) # resize
.output('pipe:', format='rawvideo', pix_fmt='rgb24', vsync='vfr')
.global_args('-loglevel', 'quiet')
.run(capture_stdout=True)
)
video = (
np
.frombuffer(out, np.uint8)
.reshape([-1, self.size, self.size, 3])
)
video = (video - self.mean) / self.std
video_data = torch.as_tensor(video).permute(0, 3, 1, 2)
elif impl == 'pyav':
images = list()
container = av.open(video_file)
container.streams.video[0].thread_type = "AUTO"
stream = container.streams.video[0]
total_frames = stream.frames
assert total_frames != 0
duration = int(stream.duration * stream.time_base)
if sample_fp > 0:
interval = max(int(total_frames / duration / sample_fp), 1)
else:
interval = 1
for frame in container.decode(stream):
if frame.index % interval != 0:
continue
images.append(preprocess(frame.to_rgb().to_image()))
if len(images) > 0:
video_data = torch.stack(images) # th.tensor(np.stack(images))
else:
video_data = torch.zeros(1)
container.close()
else:
raise NotImplementedError
else:
if impl == "cv2":
                # Sample a fixed number of frames spread evenly over the video.
                interval = max(frameCount // num_frames, 1)  # integer interval, only used to bound the start offset
                start = min(sample_offset, interval - 1) if sample_offset > -1 else random.randint(0, interval - 1)
                interval = frameCount / num_frames  # fractional interval used for the actual frame positions
for i in range(num_frames):
cap.set(cv2.CAP_PROP_POS_FRAMES, start + int(i * interval))
ret, frame = cap.read()
if not ret: break
frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
images.append(preprocess(Image.fromarray(frame_rgb).convert("RGB")))
if len(images) > 0:
video_data = torch.tensor(np.stack(images))
else:
video_data = torch.zeros(1)
cap.release()
elif impl == "pyav":
images = list()
container = av.open(video_file)
container.streams.video[0].thread_type = "AUTO"
stream = container.streams.video[0]
total_frames = stream.frames
assert total_frames != 0
interval = max(total_frames // num_frames, 1) # this interval is int
for frame in container.decode(stream):
if frame.index % interval != 0:
continue
images.append(preprocess(frame.to_rgb().to_image()))
if len(images) > 0:
video_data = torch.stack(images) # th.tensor(np.stack(images))
else:
video_data = torch.zeros(1)
container.close()
else:
raise NotImplementedError
return {'video': video_data}
def get_video_data(self, video_path, num_frames, sample_offset, start_time=None, end_time=None):
image_input = self.video_to_tensor(video_path, self.transform, sample_fp=self.framerate, num_frames=num_frames, sample_offset=sample_offset, start_time=start_time, end_time=end_time)
return image_input
def process_raw_data(self, raw_video_data):
tensor_size = raw_video_data.size()
tensor = raw_video_data.view(-1, 1, tensor_size[-3], tensor_size[-2], tensor_size[-1])
return tensor
def process_frame_order(self, raw_video_data, frame_order=0):
# 0: ordinary order; 1: reverse order; 2: random order.
if frame_order == 0:
pass
elif frame_order == 1:
reverse_order = np.arange(raw_video_data.size(0) - 1, -1, -1)
raw_video_data = raw_video_data[reverse_order, ...]
elif frame_order == 2:
random_order = np.arange(raw_video_data.size(0))
np.random.shuffle(random_order)
raw_video_data = raw_video_data[random_order, ...]
return raw_video_data
# An ordinary video frame extractor based on CV2.
RawVideoExtractor = RawVideoExtractorCV2
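if __name__ == "__main__":
    # Minimal usage sketch (not part of the original module). "example.mp4" is
    # a placeholder path; replace it with a real video file.
    extractor = RawVideoExtractor(centercrop=True, size=224, framerate=1)
    data = extractor.get_video_data("example.mp4", num_frames=12, sample_offset=0)
    video = data["video"]  # (num sampled frames, 3, 224, 224)
    video = extractor.process_frame_order(video, frame_order=0)
    print(video.shape)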