File size: 5,924 Bytes
b84549f |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
import os
from pathlib import Path
import random
import numpy as np
import pickle as pk
import cv2
from tqdm import tqdm
from PIL import Image
import torchvision.transforms as transforms
import torch
# from prefetch_generator import BackgroundGenerator
from torch.utils.data import DataLoader, Dataset
class VideoDataset(Dataset):
    """Dataset of video clips organized as <root>/<label>/<video_file>.

    Each item is a pair ``(clip, label_index)`` where ``clip`` is a float32
    array of shape ``(3, clip_len, crop_size, crop_size)`` (random spatial
    crop of the resized frames) and ``label_index`` is an int derived from
    the sorted set of folder names.
    """

    def __init__(self, directory_list, local_rank=0, enable_GPUs_num=0,
                 distributed_load=False, resize_shape=(224, 224),
                 mode='train', clip_len=32, crop_size=168):
        """
        Args:
            directory_list: list of dataset roots; each root contains one
                sub-folder per class label.
            local_rank / enable_GPUs_num / distributed_load: when
                ``distributed_load`` is True in train mode, each rank keeps
                only its 1/enable_GPUs_num shard of the (shuffled) file list.
            resize_shape: (H, W) every decoded frame is resized to.
                (Default changed from a mutable list to a tuple; indexing
                behavior is identical for callers.)
            mode: 'train' enables augmentation and full per-class listings;
                any other mode keeps only the first 10 files per class.
            clip_len: number of frames per clip.
            crop_size: side of the random square crop taken in __getitem__.
        """
        self.clip_len, self.crop_size, self.resize_shape = clip_len, crop_size, resize_shape
        self.mode = mode
        self.fnames, labels = [], []
        # Collect (path, label) pairs from every root directory.
        for directory in directory_list:
            folder = Path(directory)
            print("Load dataset from folder : ", folder)
            for label in sorted(os.listdir(folder)):
                names = os.listdir(os.path.join(folder, label))
                if mode != "train":
                    # Non-train splits only sample 10 videos per class.
                    names = names[:10]
                for fname in names:
                    self.fnames.append(os.path.join(folder, label, fname))
                    labels.append(label)
        # Shuffle files and labels in lockstep.
        random_list = list(zip(self.fnames, labels))
        random.shuffle(random_list)
        self.fnames[:], labels[:] = zip(*random_list)
        # NOTE(review): self.labels keeps the FULL label list even when the
        # file list is sharded below (matches original behavior) — confirm
        # whether any caller relies on that.
        self.labels = labels
        if mode == 'train' and distributed_load:
            # Give each GPU rank a disjoint, equally-sized shard.
            single_num_ = len(self.fnames) // enable_GPUs_num
            self.fnames = self.fnames[local_rank * single_num_:(local_rank + 1) * single_num_]
            labels = labels[local_rank * single_num_:(local_rank + 1) * single_num_]
        # Map label names (strings) to indices (ints), then vectorize.
        self.label2index = {label: index for index, label in enumerate(sorted(set(labels)))}
        self.label_array = np.array([self.label2index[label] for label in labels], dtype=int)
        # Built once here instead of on every loadvideo() call (hoisted).
        self.transform = transforms.Compose([
            transforms.Resize([self.resize_shape[0], self.resize_shape[1]]),
            transforms.ToTensor(),
            transforms.Normalize([0.5, 0.5, 0.5], [0.5, 0.5, 0.5])
        ])

    def __getitem__(self, index):
        """Return (clip, label) with a random square spatial crop applied."""
        buffer = self.loadvideo(self.fnames[index])
        # buffer is (3, clip_len, H, W). Guard randint against a zero range
        # (np.random.randint(0) raises) when a dimension equals crop_size.
        h_range = buffer.shape[2] - self.crop_size
        w_range = buffer.shape[3] - self.crop_size
        height_index = np.random.randint(h_range) if h_range > 0 else 0
        width_index = np.random.randint(w_range) if w_range > 0 else 0
        return (buffer[:, :,
                       height_index:height_index + self.crop_size,
                       width_index:width_index + self.crop_size],
                self.label_array[index])

    def __len__(self):
        return len(self.fnames)

    def loadvideo(self, fname):
        """Decode clip_len frames from fname (with random temporal offset and
        optional 1-in-2 frame skipping), resize/normalize them, and return a
        float32 array of shape (3, clip_len, resize_H, resize_W)."""
        # Random flip augmentation with p=0.5, train mode only.
        # BUG FIX: the original always called cv2.flip — with flipCode=0
        # (a VERTICAL flip) whenever augmentation was not selected, so
        # validation data was silently flipped. None now means "no flip".
        flipCode = (random.choice([-1, 0, 1])
                    if self.mode == "train" and np.random.random() < 0.5
                    else None)
        try:
            video_stream = cv2.VideoCapture(fname)
            frame_count = int(video_stream.get(cv2.CAP_PROP_FRAME_COUNT))
        except RuntimeError:
            # Unreadable file: fall back to a random replacement sample.
            index = np.random.randint(len(self))
            video_stream = cv2.VideoCapture(self.fnames[index])
            frame_count = int(video_stream.get(cv2.CAP_PROP_FRAME_COUNT))
        # Resample until we land on a video long enough for one clip.
        while frame_count < self.clip_len + 2:
            index = np.random.randint(len(self))
            video_stream = cv2.VideoCapture(self.fnames[index])
            frame_count = int(video_stream.get(cv2.CAP_PROP_FRAME_COUNT))
        # Take every 2nd frame half the time when the video is long enough.
        speed_rate = np.random.randint(1, 3) if frame_count > self.clip_len * 2 + 2 else 1
        time_index = np.random.randint(frame_count - self.clip_len * speed_rate)
        start_idx = time_index
        end_idx = time_index + self.clip_len * speed_rate
        count, sample_count, retaining = 0, 0, True
        # zeros (not empty): if decoding stops early, the tail holds real
        # zeros rather than uninitialized garbage. float32 so PyTorch later
        # converts it to a FloatTensor.
        buffer = np.zeros((self.clip_len, 3, self.resize_shape[0], self.resize_shape[1]),
                          dtype=np.float32)
        while count <= end_idx and retaining:
            retaining, frame = video_stream.read()
            if count < start_idx:
                count += 1
                continue
            if count % speed_rate == speed_rate - 1 and sample_count < self.clip_len:
                # Guard: read() may fail on the final iteration and hand us
                # frame=None; the original passed that straight to cv2.flip.
                if retaining and frame is not None:
                    if flipCode is not None:
                        frame = cv2.flip(frame, flipCode=flipCode)
                    try:
                        buffer[sample_count] = self.transform(
                            Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
                        sample_count += 1
                    except cv2.error:
                        # Skip an undecodable frame. BUG FIX: the original
                        # `continue` here also skipped count += 1, letting the
                        # frame index drift out of sync with the decoder.
                        pass
            count += 1
        video_stream.release()
        # (clip_len, 3, H, W) -> (3, clip_len, H, W)
        return buffer.transpose((1, 0, 2, 3))
if __name__ == '__main__':
    # Smoke test: build a validation split and inspect a single sample.
    dataset_roots = ['/data/datasets/ucf101/videos']
    video_dataset = VideoDataset(dataset_roots,
                                 resize_shape=[224, 224],
                                 mode='validation')
    clip, target = video_dataset[0]
    # clip: (3, num_frames, w, h); target: scalar label index
    print(clip.shape, target.shape, target)
|