# hma/datasets/extern/egoexo4d.py
# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import json
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path
from typing import Iterable

import cv2
import numpy as np
from tqdm import tqdm

CURRENT_DIR = os.path.dirname(__file__)

# Adjust these to wherever your detections and frames are stored.
CAM = "cam01"  # camera whose video stream and 2D wrist annotations are used
ROOT = "/datasets01/egoexo4d/v2/"
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
# from epic_kitchens.hoa import load_detections
TAKE_ROOT = ROOT + "takes.json"
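# Note: VIDEO_PATH is formatted with (take_name, camera name), LABEL_ROOT with the
# take UID, and TAKE_ROOT lists all takes (see egoexo4d_dataset_generator below).
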
def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
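    """Build one (state, action) pair from two labeled frames.

    The state is the normalized 2D position of both wrists in the current frame;
    the action is their normalized position in the next frame. If `save` is set,
    an image with the keypoints drawn is written to output/inspect/.
    """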
    img_width, img_height = image.shape[1], image.shape[0]
    # 2D wrist keypoints are in pixel coordinates; normalize by the image size
    curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
    curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']
    curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
    curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])

    next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
    next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']
    next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
    next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])

    # state: current wrist positions; action: next-frame wrist positions
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))

    if save:
        # draw the current (green) and next (red) wrist keypoints
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # save the visualization for inspection
        os.makedirs("output/inspect", exist_ok=True)
        cv2.imwrite(f"output/inspect/test_{idx}.png", image)
    return state, action

def parse_raw_video(video_path):
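    """Decode every frame of the video at `video_path` into a list of BGR arrays."""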
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def egoexo4d_dataset_size() -> int:
    """ Returns the number of takes in the dataset. ~5k for v2. """
    takes = json.load(open(TAKE_ROOT))
    return len(takes)
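
# NOTE (illustrative, inferred from the access patterns below): each automatic hand
# label file maps a frame index (as a string) to a list of annotation dicts, e.g.
#   {"42": [{"annotation2D": {"cam01": {"left_wrist":  {"x": ..., "y": ...},
#                                       "right_wrist": {"x": ..., "y": ...}}}}], ...}
# and only the first annotation entry per frame is used.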
# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.

    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # convert each take into a list of episodes that can be added to a replay buffer
    MAX_EPISODE_LENGTH = 5000
    TAKE_FILE = json.load(open(TAKE_ROOT))
    print("total takes", len(TAKE_FILE))

    # find the first camera with aria in its name (unused while CAM is hard-coded above)
    global CAM

    def find_aria_name(take):
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    if example_inds is None:
        example_inds = range(len(TAKE_FILE))

    for example_ind in example_inds:
        take = TAKE_FILE[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue
        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)
        if not os.path.exists(video_path) or not os.path.exists(label_path):
            continue

        video_frames = parse_raw_video(video_path)
        label_detections = json.load(open(label_path))
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        # states/actions are extracted from the 2D wrist keypoints of both hands
        max_frame_idx = len(video_frames) - 1
        DS_FACTOR = 1
        frame_idx = 0
        start_frame_idx = 0
        MIN_CLIP_LENGTH = 300

        def get_continuous_chunk(start_idx, label_detections):
            # advance end_idx until a frame without a (non-empty) hand label is hit
            end_idx = start_idx + 1
            while (str(start_idx) in label_detections and len(label_detections[str(start_idx)]) > 0
                   and str(end_idx) in label_detections and len(label_detections[str(end_idx)]) > 0):
                end_idx += 1
            return end_idx

        print("TAKE", take_name)
        # some frames might not have labels; if there is a gap, skip ahead
        while start_frame_idx < max_frame_idx - DS_FACTOR:
            lang = "use human hands to do some tasks"  # dummy language instruction
            if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections:
                start_frame_idx += DS_FACTOR
                continue
            end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
            # print("start_frame_idx", start_frame_idx, end_frame_idx)
            if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
                start_frame_idx = end_frame_idx
                continue
            print("start clipping from", start_frame_idx, "to", end_frame_idx)
            steps = []
            for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
                image = video_frames[frame_idx][..., [2, 1, 0]]  # BGR -> RGB
                try:
                    s, a = compute_state_and_actions(
                        image,
                        label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)],
                        frame_idx, save=False
                    )
                except Exception:
                    # missing wrist annotation for this camera/frame; stop the clip here
                    break
                # pack into a step dict
                step = {
                    "observation": {"image": image, "state": s},
                    "action": a,
                    "language_instruction": lang,
                }
                steps.append(OrderedDict(step))
                if len(steps) > MAX_EPISODE_LENGTH:
                    break
            start_frame_idx = end_frame_idx
            # only yield clips that collected enough steps
            if len(steps) >= MIN_CLIP_LENGTH:
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
                yield data_dict
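

# Minimal usage sketch (illustrative, not part of the original pipeline; assumes the
# dataset paths above exist): iterate the first few takes and report episode lengths.
if __name__ == "__main__":
    for i, episode in enumerate(egoexo4d_dataset_generator(example_inds=range(3))):
        print(f"episode {i}: {len(episode['steps'])} steps")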