|
|
|
|
|
|
|
|
|
import numbers |
|
import random |
|
import torch |
|
|
|
try: |
|
import accimage |
|
except: |
|
pass |
|
|
|
from torchvision.transforms import ( |
|
RandomResizedCrop, |
|
) |
|
|
|
from . import functional_video as F |
|
|
|
def _get_image_size(img): |
|
if isinstance(img, torch.Tensor) and img.dim() > 2: |
|
return img.shape[-2:][::-1] |
|
else: |
|
raise TypeError("Unexpected type {}".format(type(img))) |
|
|
|
class RandomCrop(object):
    """Crop the given tensor image or clip at a random location.

    Size lookups go through ``_get_image_size``, so only ``torch.Tensor``
    inputs with more than two dimensions are accepted; the last two
    dimensions are treated as (height, width).

    Args:
        size (sequence or int): Desired output size of the crop. If size is an
            int instead of sequence like (h, w), a square crop (size, size) is
            made.
        padding (int or sequence, optional): Optional padding on each border
            of the image. Default is None, i.e no padding. If a sequence of
            length 4 is provided, it is used to pad left, top, right, bottom
            borders respectively. If a sequence of length 2 is provided, it is
            used to pad left/right, top/bottom borders, respectively.
        pad_if_needed (boolean): It will pad the image if smaller than the
            desired size to avoid raising an exception. Since cropping is done
            after padding, the padding seems to be done at a random offset.
        fill: Pixel fill value for constant fill. Default is 0. If a tuple of
            length 3, it is used to fill R, G, B channels respectively.
            This value is only used when the padding_mode is constant.
        padding_mode: Type of padding. Should be: constant, edge, reflect or
            symmetric. Default is constant.

            - constant: pads with a constant value, this value is specified
              with fill
            - edge: pads with the last value on the edge of the image
            - reflect: pads with reflection of image (without repeating the
              last value on the edge); padding [1, 2, 3, 4] with 2 elements on
              both sides in reflect mode will result in
              [3, 2, 1, 2, 3, 4, 3, 2]
            - symmetric: pads with reflection of image (repeating the last
              value on the edge); padding [1, 2, 3, 4] with 2 elements on both
              sides in symmetric mode will result in
              [2, 1, 1, 2, 3, 4, 4, 3]

    NOTE(review): padding is delegated to ``F.pad`` from ``functional_video``;
    confirm that module provides ``pad`` and that it accepts tensors.
    """

    def __init__(self, size, padding=None, pad_if_needed=False, fill=0, padding_mode='constant'):
        if isinstance(size, numbers.Number):
            self.size = (int(size), int(size))
        else:
            self.size = size
        self.padding = padding
        self.pad_if_needed = pad_if_needed
        self.fill = fill
        self.padding_mode = padding_mode

    @staticmethod
    def get_params(img, output_size):
        """Get parameters for ``crop`` for a random crop.

        Args:
            img (torch.Tensor): Image or clip to be cropped.
            output_size (tuple): Expected output size (height, width) of the
                crop.

        Returns:
            tuple: params (i, j, h, w) to be passed to ``crop`` for random
            crop, where i is the row offset and j is the column offset.

        Raises:
            TypeError: If ``img`` is not a tensor with more than two
                dimensions.
            ValueError: If the requested crop is larger than the input.
        """
        w, h = _get_image_size(img)
        th, tw = output_size
        if w == tw and h == th:
            return 0, 0, h, w

        # Fail with a descriptive message instead of the opaque "empty range"
        # ValueError that random.randint raises for a negative upper bound.
        if th > h or tw > w:
            raise ValueError(
                "Required crop size {} is larger than input image size {}".format((th, tw), (h, w))
            )

        i = random.randint(0, h - th)
        j = random.randint(0, w - tw)
        return i, j, th, tw

    def __call__(self, img):
        """
        Args:
            img (torch.Tensor): Image or clip to be cropped.

        Returns:
            The result of ``F.crop`` on the (optionally padded) input.
        """
        if self.padding is not None:
            img = F.pad(img, self.padding, self.fill, self.padding_mode)

        if self.pad_if_needed:
            target_h, target_w = self.size

            # Tensor.size is a method, not a (width, height) tuple, so the
            # dimensions are read through _get_image_size instead of img.size.
            width, _ = _get_image_size(img)
            if width < target_w:
                # pad the width if needed
                img = F.pad(img, (target_w - width, 0), self.fill, self.padding_mode)

            # Re-read the size because the previous step may have replaced img.
            _, height = _get_image_size(img)
            if height < target_h:
                # pad the height if needed
                img = F.pad(img, (0, target_h - height), self.fill, self.padding_mode)

        i, j, h, w = self.get_params(img, self.size)

        return F.crop(img, i, j, h, w)

    def __repr__(self):
        return self.__class__.__name__ + '(size={0}, padding={1})'.format(self.size, self.padding)
|
|
|
|
|
|
|
|
|
|
|
class RandomCropVideo(RandomCrop):
    """Randomly crop a video clip to a fixed spatial size.

    Args:
        size (sequence or int): Desired output size. A number yields a square
            crop (size, size); any other value is stored unchanged.
    """

    def __init__(self, size):
        if not isinstance(size, numbers.Number):
            self.size = size
            return
        self.size = (int(size), int(size))

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)

        Returns:
            torch.tensor: randomly cropped/resized video clip.
                size is (C, T, OH, OW)
        """
        # get_params is inherited from RandomCrop and yields
        # (row offset, column offset, crop height, crop width).
        top, left, height, width = self.get_params(clip, self.size)
        return F.crop(clip, top, left, height, width)

    def __repr__(self):
        return type(self).__name__ + '(size={0})'.format(self.size)
|
|
|
|
|
class RandomResizedCropVideo(RandomResizedCrop):
    """Crop a random region of a video clip and resize it to a fixed size.

    Args:
        size (tuple or int): Output size. A tuple must be (height, width);
            any other value is duplicated into (size, size).
        scale (tuple): Passed to the inherited ``get_params`` as the scale
            range of the crop.
        ratio (tuple): Passed to the inherited ``get_params`` as the aspect
            ratio range of the crop.
        interpolation_mode (str): Forwarded to ``F.resized_crop``.
    """

    def __init__(
        self,
        size,
        scale=(0.08, 1.0),
        ratio=(3.0 / 4.0, 4.0 / 3.0),
        interpolation_mode="bilinear",
    ):
        if not isinstance(size, tuple):
            size = (size, size)
        else:
            assert len(size) == 2, "size should be tuple (height, width)"
        self.size = size

        self.interpolation_mode = interpolation_mode
        self.scale = scale
        self.ratio = ratio

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)

        Returns:
            torch.tensor: randomly cropped/resized video clip.
                size is (C, T, H, W)
        """
        crop_box = self.get_params(clip, self.scale, self.ratio)
        i, j, h, w = crop_box
        return F.resized_crop(clip, i, j, h, w, self.size, self.interpolation_mode)

    def __repr__(self):
        details = '(size={0}, interpolation_mode={1}, scale={2}, ratio={3})'.format(
            self.size, self.interpolation_mode, self.scale, self.ratio
        )
        return type(self).__name__ + details
|
|
|
|
|
class CenterCropVideo(object):
    """Crop the central region of a video clip.

    Args:
        crop_size (sequence or int): Desired output size. A number yields a
            square crop (crop_size, crop_size); any other value is stored
            unchanged.
    """

    def __init__(self, crop_size):
        if not isinstance(crop_size, numbers.Number):
            self.crop_size = crop_size
            return
        self.crop_size = (int(crop_size), int(crop_size))

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)

        Returns:
            torch.tensor: central cropping of video clip. Size is
                (C, T, crop_size, crop_size)
        """
        cropped = F.center_crop(clip, self.crop_size)
        return cropped

    def __repr__(self):
        return type(self).__name__ + '(crop_size={0})'.format(self.crop_size)
|
|
|
class CornerCropVideo(object):
    """Crop a fixed-size region of a video clip at a named location.

    Args:
        crop_size (sequence or int): Desired output size. A number yields a
            square crop (crop_size, crop_size); any other value is stored
            unchanged and is assumed to be (height, width) -- TODO confirm
            against ``functional_video.corner_crop``.
        loc (str): Default crop location, used whenever ``__call__`` is not
            given one explicitly. ``"tr"`` crops at offset (0, 0),
            ``"center"`` calls ``F.corner_crop`` without explicit offsets, and
            any other value crops flush against the bottom and right edges.
            Default is ``"tr"``.
    """

    def __init__(self, crop_size, loc="tr"):
        if isinstance(crop_size, numbers.Number):
            self.crop_size = (int(crop_size), int(crop_size))
        else:
            self.crop_size = crop_size
        # Keep the configured location so the transform can be used in a
        # pipeline that only ever calls it with the clip.
        self.loc = loc

    def __call__(self, clip, loc=None):
        """
        Args:
            clip (torch.tensor): Video clip to be cropped. Size is (C, T, H, W)
            loc (str, optional): Crop location for this call. When None, the
                location given to the constructor is used.

        Returns:
            torch.tensor: the result of ``F.corner_crop`` for the selected
                location.
        """
        if loc is None:
            loc = self.loc

        if loc == "center":
            # No offsets are passed here; the behaviour is whatever
            # F.corner_crop does by default.
            return F.corner_crop(clip, self.crop_size)

        if loc == "tr":
            i = 0
            j = 0
        else:
            # crop_size is a pair, so subtract the matching component from
            # each spatial dimension rather than the pair itself.
            crop_h, crop_w = self.crop_size
            i = clip.size(-2) - crop_h
            j = clip.size(-1) - crop_w
        return F.corner_crop(clip, self.crop_size, i, j)

    def __repr__(self):
        return self.__class__.__name__ + '(crop_size={0})'.format(self.crop_size)
|
|
|
|
|
class NormalizeVideo(object):
    """
    Normalize the video clip by mean subtraction and division by standard deviation

    Args:
        mean (3-tuple): pixel RGB mean
        std (3-tuple): pixel RGB standard deviation
        inplace (boolean): whether do in-place normalization
    """

    def __init__(self, mean, std, inplace=False):
        self.mean, self.std, self.inplace = mean, std, inplace

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): video clip to be normalized. Size is (C, T, H, W)

        Returns:
            The result of ``F.normalize`` for the stored mean, std and
            inplace flag.
        """
        normalized = F.normalize(clip, self.mean, self.std, self.inplace)
        return normalized

    def __repr__(self):
        settings = '(mean={0}, std={1}, inplace={2})'.format(
            self.mean, self.std, self.inplace)
        return type(self).__name__ + settings
|
|
|
|
|
class ToTensorVideo(object):
    """
    Convert tensor data type from uint8 to float, divide value by 255.0 and
    permute the dimensions of clip tensor
    """

    def __init__(self):
        """The transform has no configuration."""

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor, dtype=torch.uint8): Size is (T, H, W, C)

        Return:
            clip (torch.tensor, dtype=torch.float): Size is (C, T, H, W)
        """
        converted = F.to_tensor(clip)
        return converted

    def __repr__(self):
        return type(self).__name__
|
|
|
|
|
class RandomHorizontalFlipVideo(object):
    """
    Flip the video clip along the horizontal direction with a given probability

    Args:
        p (float): probability of the clip being flipped. Default value is 0.5
    """

    def __init__(self, p=0.5):
        self.p = p

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (C, T, H, W)

        Return:
            clip (torch.tensor): Size is (C, T, H, W)
        """
        # Exactly one random draw per call decides whether to flip.
        should_flip = random.random() < self.p
        return F.hflip(clip) if should_flip else clip

    def __repr__(self):
        return type(self).__name__ + "(p={0})".format(self.p)
|
|
|
|
|
|
|
class ResizeVideo(object):
    """
    Resize the video clip to a fixed width and height using bilinear
    interpolation.

    Args:
        w: target width
        h: target height
    """

    def __init__(self, w, h):
        self.w, self.h = w, h

    def __call__(self, clip):
        """
        Args:
            clip (torch.tensor): Size is (C, T, H, W)

        Return:
            clip (torch.tensor): Size is (C, T, h, w)
        """
        # Swap the first two dimensions so interpolate sees (T, C, H, W),
        # then swap them back once every frame has been resized.
        frames_first = clip.permute(1, 0, 2, 3)
        resized = torch.nn.functional.interpolate(
            frames_first, (self.h, self.w), mode="bilinear", align_corners=False
        )
        return resized.permute(1, 0, 2, 3)

    def __repr__(self):
        return type(self).__name__ + "(w=%d,h=%d)" % (self.w, self.h)