import base64
import io
import logging

import av
import numpy as np
import torch
from torchvision import transforms

logging.basicConfig(filename='/mnt/data/uploads/logfile-video.log', level=logging.INFO)


def makeStack(video_base64):
    """Decode a base64-encoded video and stack all decoded frames into one array."""
    video_data = base64.b64decode(video_base64)
    container = av.open(io.BytesIO(video_data))
    frames = []
    for frame in container.decode(video=0):
        # Convert each decoded frame to an RGB uint8 ndarray of shape (H, W, 3).
        frames.append(frame.to_ndarray(format="rgb24").astype(np.uint8))
    if not frames:
        raise ValueError("No video frames could be decoded from the input.")
    return np.stack(frames, axis=0)


def read_video(video_base64, num_frames=24, target_size=(224, 224)):
    """Decode, temporally sample, and resize a base64-encoded video.

    Returns a tensor of shape (1, num_frames, 3, *target_size).
    """
    frames = makeStack(video_base64)
    frames = sample_frames(frames, num_frames)
    processed_frames = pad_and_resize(frames, target_size)
    return processed_frames


def sample_frames(frames, num_frames):
    """Uniformly sample `num_frames` frames; zero-pad if the clip is too short."""
    total_frames = len(frames)
    if total_frames >= num_frames:
        # Pick evenly spaced frame indices across the whole clip.
        indices = np.linspace(0, total_frames - 1, num=num_frames, dtype=int)
        sampled_frames = [frames[i] for i in indices]
    else:
        # Too few frames: keep them all and pad with black frames at the end.
        sampled_frames = list(frames)
        padding = [np.zeros_like(frames[0]) for _ in range(num_frames - total_frames)]
        sampled_frames.extend(padding)
    return np.array(sampled_frames)


def pad_and_resize(frames, target_size):
    """Resize each frame to `target_size` and stack into a batched tensor."""
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize(target_size),
        transforms.ToTensor(),  # (H, W, 3) uint8 -> (3, H, W) float in [0, 1]
    ])
    processed_frames = torch.stack([transform(frame) for frame in frames])
    # processed_frames has shape (num_frames, 3, H, W); add a leading batch
    # dimension to get (1, num_frames, 3, H, W). The original
    # permute(0, 1, 2, 3) was a no-op, so it is dropped.
    return processed_frames.unsqueeze(0)
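

# Usage sketch (not part of the original script): read a local video file,
# base64-encode it, and run it through the pipeline. The path "sample.mp4"
# is a hypothetical placeholder for any container format PyAV can decode.
if __name__ == "__main__":
    with open("sample.mp4", "rb") as f:
        encoded = base64.b64encode(f.read())
    clip = read_video(encoded, num_frames=24, target_size=(224, 224))
    logging.info("Processed clip shape: %s", tuple(clip.shape))
    print(clip.shape)  # expected: torch.Size([1, 24, 3, 224, 224])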