# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import os
import json
from collections import OrderedDict
from typing import Iterable

import cv2
import numpy as np

CURRENT_DIR = os.path.dirname(__file__)
# Adjust these to wherever your detections and frames are stored.
CAM = "cam01"
ROOT = "/datasets01/egoexo4d/v2/"
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
# from epic_kitchens.hoa import load_detections
TAKE_ROOT = ROOT + "takes.json"
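# For reference, the format strings above expand to paths like the following
# (the take name and uid here are placeholders, not real entries):
#   VIDEO_PATH.format("<take_name>", CAM)
#     -> /datasets01/egoexo4d/v2/takes/<take_name>/frame_aligned_videos/cam01.mp4
#   LABEL_ROOT.format("<take_uid>")
#     -> /datasets01/egoexo4d/v2/annotations/ego_pose/train/hand/automatic/<take_uid>.json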
def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
    img_width, img_height = image.shape[1], image.shape[0]
    # wrist keypoints come in pixel coordinates; normalize them by the image size
    curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
    curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']
    curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
    curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])
    next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
    next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']
    next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
    next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])
    # state: normalized (x, y) of both wrists in the current frame
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    # action: normalized (x, y) of both wrists in the next frame
    action = np.concatenate((next_hand1_center, next_hand2_center))
    if save:
        # draw the current (green) and next (red) wrist locations for inspection
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # save the annotated frame
        cv2.imwrite(f"output/inspect/test_{idx}.png", image)
    return state, action
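# A minimal usage sketch for compute_state_and_actions with a synthetic label
# entry. The nesting (frame -> 'annotation2D' -> camera -> wrist -> 'x'/'y')
# mirrors what the function reads from the ego_pose JSON above; the pixel
# values below are made up for illustration.
def _example_compute_state_and_actions():
    dummy_image = np.zeros((1080, 1440, 3), dtype=np.uint8)

    def make_frame(lx, ly, rx, ry):
        return [{"annotation2D": {CAM: {"left_wrist": {"x": lx, "y": ly},
                                        "right_wrist": {"x": rx, "y": ry}}}}]

    curr = make_frame(400.0, 600.0, 900.0, 620.0)
    nxt = make_frame(410.0, 590.0, 905.0, 615.0)
    state, action = compute_state_and_actions(dummy_image, curr, nxt, idx=0)
    # both are length-4 vectors: (left_x, left_y, right_x, right_y) in [0, 1]
    assert state.shape == (4,) and action.shape == (4,)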
def parse_raw_video(video_path):
    """Decode every frame of the video into memory (BGR, as returned by OpenCV)."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames
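# parse_raw_video decodes an entire take into memory at once, which can be
# large for long takes. This optional sketch (not used by the generator below)
# yields frames one at a time instead, releasing the capture when done.
def iter_raw_video(video_path):
    cap = cv2.VideoCapture(video_path)
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            yield frame
    finally:
        cap.release()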
def egoexo4d_dataset_size() -> int:
    """Return the number of takes in the dataset (~5k for v2)."""
    takes = json.load(open(TAKE_ROOT))
    return len(takes)
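# egoexo4d_dataset_size is convenient for sharding the conversion across
# workers. A small sketch (worker_id / num_workers are hypothetical arguments,
# not part of any dataset API): each worker gets a strided subset of take indices.
def shard_indices(worker_id: int, num_workers: int) -> range:
    return range(worker_id, egoexo4d_dataset_size(), num_workers)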
# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.

    Args:
        example_inds: if specified, only yield data from these indices;
            otherwise yield the entire dataset.
    """
    # convert takes into episodes that can be added to a replay buffer
    MAX_EPISODE_LENGTH = 5000
    TAKE_FILE = json.load(open(TAKE_ROOT))
    print("total takes", len(TAKE_FILE))

    # optionally switch to the first aria camera of each take
    global CAM

    def find_aria_name(take):
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    if example_inds is None:
        example_inds = range(len(TAKE_FILE))

    for example_ind in example_inds:
        take = TAKE_FILE[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue
        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)
        if not os.path.exists(video_path) or not os.path.exists(label_path):
            continue

        video_frames = parse_raw_video(video_path)
        label_detections = json.load(open(label_path))
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        # actions are the next-frame wrist keypoints of both hands
        max_frame_idx = len(video_frames) - 1
        DS_FACTOR = 1
        frame_idx = 0
        start_frame_idx = 0
        MIN_CLIP_LENGTH = 300

        def get_continuous_chunk(start_idx, label_detections):
            # advance until the first frame index whose annotation is missing or empty
            end_idx = start_idx + 1
            while (
                str(start_idx) in label_detections
                and len(label_detections[str(start_idx)]) > 0
                and str(end_idx) in label_detections
                and len(label_detections[str(end_idx)]) > 0
            ):
                end_idx += 1
            return end_idx
print("TAKE", take_name) | |
# some frames might not have label. if there is a gap, skip | |
while start_frame_idx < max_frame_idx - DS_FACTOR: | |
# print(video_detections[frame_idx].hands) | |
lang = "use human hands to do some tasks" # dummies | |
if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections: | |
start_frame_idx += DS_FACTOR | |
continue | |
end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections) | |
# print("start_frame_idx", start_frame_idx, end_frame_idx) | |
if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH: | |
start_frame_idx = end_frame_idx | |
continue | |
print("start clipping from", start_frame_idx, "to", end_frame_idx) | |
steps = [] | |
for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR): | |
image = video_frames[frame_idx][...,[2,1,0]] # RGB | |
try: | |
s, a = compute_state_and_actions( | |
image, | |
label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)], | |
frame_idx, save=False | |
) | |
except: | |
break | |
# break into step dict | |
step = { | |
"observation": {"image": image, "state": s}, | |
"action": a, | |
"language_instruction": lang, | |
} | |
steps.append(OrderedDict(step)) | |
if len(steps) > MAX_EPISODE_LENGTH: | |
break | |
start_frame_idx = end_frame_idx | |
if len(steps) < MIN_CLIP_LENGTH: | |
data_dict = {"steps": steps} | |
print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}") | |
yield data_dict | |
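# Example driver, as a sketch: iterate a few takes and report episode lengths.
# Paths under ROOT must exist for anything to be yielded.
if __name__ == "__main__":
    for episode in egoexo4d_dataset_generator(example_inds=range(5)):
        steps = episode["steps"]
        print("episode with", len(steps), "steps;",
              "state shape", steps[0]["observation"]["state"].shape,
              "action shape", steps[0]["action"].shape)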