import numpy as np
import torch
from torchvision import transforms
import av
import logging
import base64
import io

logging.basicConfig(filename='/mnt/data/uploads/logfile-video.log', level=logging.INFO)


def read_video(video_base64, num_frames=24, target_size=(224, 224)):
    """Decode a base64-encoded video into a fixed-length frame tensor.

    Args:
        video_base64: Base64-encoded bytes of a video container that PyAV
            can open.
        num_frames: Number of frames in the returned clip. Longer videos are
            sampled at evenly spaced positions; shorter ones are padded with
            black frames at the end.
        target_size: Size passed to ``transforms.Resize`` for every frame.

    Returns:
        A ``torch.Tensor`` laid out as (C, T, H, W), where T is
        ``num_frames``.

    Raises:
        ValueError: If the video contains no decodable frames.
    """
    video_data = base64.b64decode(video_base64)

    # The context manager closes the container (and its decoder resources)
    # even if decoding fails part-way through.
    with av.open(io.BytesIO(video_data)) as container:
        frames = [
            frame.to_ndarray(format="rgb24").astype(np.uint8)
            for frame in container.decode(video=0)
        ]

    if not frames:
        raise ValueError("video contains no decodable frames")

    sampled_frames = sample_frames(frames, num_frames)
    processed_frames = pad_and_resize(sampled_frames, target_size)
    # (T, C, H, W) -> (C, T, H, W)
    return processed_frames.permute(1, 0, 2, 3)


def sample_frames(frames, num_frames):
    """Return exactly ``num_frames`` frames stacked into one array.

    When there are more frames than requested, frames are picked at evenly
    spaced indices between the first and the last frame. When there are
    fewer, all-zero frames shaped like the first frame are appended. The
    input sequence is never modified.

    Args:
        frames: Sequence of equally shaped frame arrays.
        num_frames: Desired number of frames.

    Returns:
        ``np.ndarray`` whose first axis indexes the selected frames.

    Raises:
        ValueError: If padding is required but ``frames`` is empty, because
            there is no frame to take the padding shape from.
    """
    total_frames = len(frames)

    if total_frames > num_frames:
        indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
        return np.array([frames[i] for i in indices])

    # Work on a copy so the caller's list is left untouched.
    sampled_frames = list(frames)
    missing = num_frames - total_frames
    if missing:
        if total_frames == 0:
            raise ValueError("cannot pad an empty frame sequence")
        # np.array() below copies the data, so one blank frame can be reused.
        blank = np.zeros_like(frames[0])
        sampled_frames.extend([blank] * missing)
    return np.array(sampled_frames)


def pad_and_resize(frames, target_size):
    """Resize each frame and stack the results into a (T, C, H, W) tensor.

    Despite the name, no padding happens here; every frame is converted to a
    PIL image, resized to ``target_size`` and converted to a tensor.

    Args:
        frames: Iterable of frame arrays accepted by ``transforms.ToPILImage``.
        target_size: Size passed to ``transforms.Resize``.

    Returns:
        ``torch.Tensor`` with the per-frame tensors stacked along a new
        leading axis.
    """
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(target_size),
        transforms.ToTensor(),
    ])
    processed_frames = [transform(frame) for frame in frames]
    return torch.stack(processed_frames)