# File size: 7,218 Bytes (revision 246c106)
# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Iterable
import numpy as np
from tqdm import tqdm
from collections import OrderedDict
import os
import numpy as np
from pathlib import Path
CURRENT_DIR = os.path.dirname(__file__)
import cv2
from os.path import expanduser
import json
# Adjust these to wherever your Ego-Exo4D annotations and videos are stored.
# Camera name used both to pick the video file and to index the 2D annotations.
CAM = "cam01"
# Root directory of the dataset; every path below is built from it.
ROOT = "/datasets01/egoexo4d/v2/"
# Hand annotation file of one take; the placeholder is filled with the take UID.
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
# Frame-aligned video of one take; placeholders are (take name, camera name).
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
# JSON file listing all takes; its length is reported as the dataset size.
TAKE_ROOT = ROOT + "takes.json"
def _normalized_wrists(frame, img_width, img_height):
    """Return ``[left_x, left_y, right_x, right_y]`` of the wrists divided by the image size."""
    # Only the first entry of the per-frame annotation list is used.
    keypoints = frame[0]['annotation2D'][CAM]
    left, right = keypoints['left_wrist'], keypoints['right_wrist']
    return np.array([
        left['x'] / img_width,
        left['y'] / img_height,
        right['x'] / img_width,
        right['y'] / img_height,
    ])


def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
    """
    Build the (state, action) pair for one step from the wrist annotations.

    Args:
        image: frame as an array whose first two dimensions are (height, width).
        curr_frame: annotation list of the current frame; ``curr_frame[0]``
            must contain ``['annotation2D'][CAM]['left_wrist' | 'right_wrist']``
            with ``'x'`` and ``'y'`` entries.
        next_frame: annotation list of the following frame, same layout.
        idx: frame index, only used to name the debug image.
        save: when True, write a debug image with the wrists drawn on it to
            ``output/inspect/test_{idx}.png``.

    Returns:
        (state, action): two arrays of four values each,
        ``[left_x, left_y, right_x, right_y]`` divided by the image width and
        height. ``state`` comes from the current frame, ``action`` from the
        next frame.

    NOTE(review): the ``action`` expression and the drawing calls were
    truncated in the reviewed copy of this file; they are reconstructed here as
    "next-frame wrist positions" and four filled circles. Confirm against the
    upstream version.
    """
    img_height, img_width = image.shape[:2]

    # The annotations are divided by the image size here, so they are presumably
    # stored in pixel coordinates (not pre-normalized) -- TODO confirm.
    state = _normalized_wrists(curr_frame, img_width, img_height)
    action = _normalized_wrists(next_frame, img_width, img_height)

    if save:
        # Draw on a contiguous copy so the debug markers never end up in the
        # image that the caller stores as an observation.
        canvas = np.array(image, order="C")
        # Current wrists use (0, 255, 0), next-frame wrists use (0, 0, 255).
        for points, color in ((state, (0, 255, 0)), (action, (0, 0, 255))):
            for x, y in points.reshape(-1, 2):
                center = (int(x * img_width), int(y * img_height))
                cv2.circle(canvas, center, 10, color, -1)
        # cv2.imwrite does not create missing directories.
        os.makedirs("output/inspect", exist_ok=True)
        cv2.imwrite(f"output/inspect/test_{idx}.png", canvas)

    return state, action
def parse_raw_video(video_path):
    """
    Decode a whole video into a list of frames held in memory.

    Args:
        video_path: path of the video file to open with OpenCV.

    Returns:
        list: the frames in decoding order, exactly as returned by
        ``VideoCapture.read()``. The list is empty when the video cannot be
        opened or contains no readable frame.
    """
    import cv2

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                # End of stream (or a decoding failure): stop reading.
                break
            frames.append(frame)
    finally:
        # Always free the decoder, even if reading raised.
        cap.release()
    return frames
def egoexo4d_dataset_size() -> int:
    """ Returns the number of takes in the dataset. ~5k for v2.

    Reads the JSON file at ``TAKE_ROOT`` and returns the length of its
    top-level container.
    """
    # Use a context manager so the file handle is closed deterministically.
    with open(TAKE_ROOT) as takes_file:
        return len(json.load(takes_file))
# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.

    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.

    Yields:
        dict: ``{"steps": [...]}`` where every step is a dict with
        ``"observation"`` (``"image"`` and ``"state"``), ``"action"`` and
        ``"language_instruction"``. One dict is yielded per clip, i.e. per run
        of consecutively labelled frames that is long enough.

    NOTE(review): ``DS_FACTOR``, ``MIN_CLIP_LENGTH`` and ``MAX_EPISODE_LENGTH``
    are read as free names; their definitions are not visible in the reviewed
    part of the file and must exist at module scope. Several control-flow lines
    (``continue`` / ``break`` / ``steps.append``) were missing from the reviewed
    copy and are reconstructed here -- confirm against the upstream version.
    """
    # Only needed if the per-take camera lookup below is re-enabled.
    global CAM

    with open(TAKE_ROOT) as takes_file:
        takes = json.load(takes_file)
    print("total takes", len(takes))

    def find_aria_name(take):
        """Return the name of the first camera whose name contains 'aria', else None."""
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    def has_label(idx, label_detections):
        """True when frame `idx` has a non-empty annotation entry."""
        return len(label_detections.get(str(idx), ())) > 0

    def get_continuous_chunk(start_idx, label_detections):
        """Return the exclusive end of the run of labelled frames starting at `start_idx`."""
        end_idx = start_idx + 1
        if has_label(start_idx, label_detections):
            while has_label(end_idx, label_detections):
                end_idx += 1
        return end_idx

    lang = "use human hands to do some tasks"  # dummies

    if example_inds is None:
        example_inds = range(len(takes))

    for example_ind in example_inds:
        take = takes[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']

        # Per-take camera selection is disabled; the module-level CAM is used.
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue
        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)
        if not os.path.exists(video_path) or not os.path.exists(label_path):
            # Skip takes that lack either the video or the annotation file.
            continue

        video_frames = parse_raw_video(video_path)
        with open(label_path) as label_file:
            label_detections = json.load(label_file)
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        max_frame_idx = len(video_frames) - 1
        start_frame_idx = 0
        print("TAKE", take_name)

        # some frames might not have label. if there is a gap, skip
        while start_frame_idx < max_frame_idx - DS_FACTOR:
            if (str(start_frame_idx) not in label_detections
                    or str(start_frame_idx + DS_FACTOR) not in label_detections):
                start_frame_idx += DS_FACTOR
                continue

            end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
            # Labels may run past the decoded frames; never index beyond them.
            end_frame_idx = min(end_frame_idx, len(video_frames))
            if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
                # Labelled run is too short to form a clip.
                start_frame_idx = end_frame_idx
                continue

            print("start clipping from", start_frame_idx, "to", end_frame_idx)
            steps = []
            for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
                image = video_frames[frame_idx][..., [2, 1, 0]]  # RGB
                s, a = compute_state_and_actions(
                    image,
                    label_detections[str(frame_idx)],
                    label_detections[str(frame_idx + DS_FACTOR)],
                    frame_idx, save=False
                )
                # break into step dict
                step = {
                    "observation": {"image": image, "state": s},
                    "action": a,
                    "language_instruction": lang,
                }
                steps.append(step)
                if len(steps) > MAX_EPISODE_LENGTH:
                    # Cap the episode; the rest of this labelled run is dropped.
                    break

            start_frame_idx = end_frame_idx
            if len(steps) < MIN_CLIP_LENGTH:
                continue

            data_dict = {"steps": steps}
            print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
            yield data_dict