# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path

import cv2
import numpy as np
from tqdm import tqdm

from epic_kitchens.hoa import load_detections
from epic_kitchens.hoa.types import BBox, FloatVector, HandSide

CURRENT_DIR = os.path.dirname(__file__)
RESOLUTION = (480, 480)
home = expanduser("~")

# Adjust these to wherever your detections and frames are stored.
DETECTION_ROOT = "/checkpoint/xinleic/LR/epic-kitchens-100-hand-object-bboxes/labels/hand-objects"
FRAMES_ROOT = "/datasets01/EPIC-KITCHENS-100"
# DETECTION_ROOT = f"{home}/Projects/epic_kitchen_labels/hand-objects"
# FRAMES_ROOT = f"{home}/EPIC-KITCHENS"

detections_root = Path(DETECTION_ROOT)
frames_root = Path(FRAMES_ROOT)
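
# Expected on-disk layout (inferred from the paths used below):
#   FRAMES_ROOT/<participant_id>/rgb_frames/<video_id>/frame_XXXXXXXXXX.jpg
#   DETECTION_ROOT/<participant_id>/<video_id>.pkl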


def compute_state_and_actions(curr_frame, next_frame):
    # Order hands so that index 0 is always the left hand.
    curr_hand1, curr_hand2 = curr_frame.hands[0], curr_frame.hands[1]
    if curr_hand1.side != HandSide.LEFT:  # flip
        curr_hand1, curr_hand2 = curr_hand2, curr_hand1

    # Bounding-box centers are already normalized.
    curr_hand1_center = curr_hand1.bbox.center
    curr_hand2_center = curr_hand2.bbox.center

    next_hand1, next_hand2 = next_frame.hands[0], next_frame.hands[1]
    if next_hand1.side != HandSide.LEFT:  # flip
        next_hand1, next_hand2 = next_hand2, next_hand1

    next_hand1_center = next_hand1.bbox.center
    next_hand2_center = next_hand2.bbox.center

    # State: current (left, right) hand centers; action: per-hand displacement to the next frame.
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate(
        (
            np.array(next_hand1_center) - np.array(curr_hand1_center),
            np.array(next_hand2_center) - np.array(curr_hand2_center),
        )
    )
    return state, action
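
# Usage sketch (not executed; "P01" / "P01_01.pkl" below are illustrative placeholders):
# given two consecutive detection frames that each contain exactly two hands,
# `state` holds the current normalized (left, right) hand-box centers and
# `action` their displacement to the next frame.
#
#   dets = load_detections(detections_root / "P01" / "P01_01.pkl")
#   if len(dets[0].hands) == 2 and len(dets[1].hands) == 2:
#       state, action = compute_state_and_actions(dets[0], dets[1])
#       # both are length-4 numpy arrays: (left_x, left_y, right_x, right_y)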


# Define your own dataset conversion here.
def convert_dataset_image():
    # Convert each video into an episode (a list of steps) that can be added to a replay buffer.
    ALL_EPISODES = os.listdir(FRAMES_ROOT)
    MAX_EPISODE_LENGTH = 5000

    for EPS in ALL_EPISODES:
        rgb_path = os.path.join(FRAMES_ROOT, EPS, "rgb_frames")
        if not os.path.exists(rgb_path):
            continue

        for video_id in os.listdir(rgb_path):
            full_path = os.path.join(rgb_path, video_id)
            if (
                not full_path.endswith(".tar") and not full_path.endswith(".jpg") and not full_path.endswith("home")
            ):  # folder of extracted frames
                # Actions are extracted as frame-to-frame differences of both hands' bounding-box centers.
                participant_id = video_id[:3]
                video_detections = load_detections(detections_root / participant_id / (video_id + ".pkl"))
                max_frame_idx = len(video_detections) - 1
                DS_FACTOR = 1
                print(full_path)
                steps = []
                for frame_idx in range(0, max_frame_idx - DS_FACTOR, DS_FACTOR):
                    # Skip transitions where either frame does not have exactly two detected hands.
                    if (
                        len(video_detections[frame_idx].hands) != 2
                        or len(video_detections[frame_idx + DS_FACTOR].hands) != 2
                    ):
                        continue
                    s, a = compute_state_and_actions(
                        video_detections[frame_idx], video_detections[frame_idx + DS_FACTOR]
                    )
                    lang = "use human hands to do some tasks"  # placeholder language instruction
                    image_path = frames_root / participant_id / "rgb_frames" / video_id / f"frame_{frame_idx:010d}.jpg"
                    image = cv2.imread(str(image_path))
                    if image is None:
                        continue
                    image = image[..., [2, 1, 0]]  # BGR -> RGB
                    # Pack one transition into a step dict.
                    step = {
                        "observation": {"image": image, "state": s},
                        "action": a,
                        "language_instruction": lang,
                    }
                    steps.append(OrderedDict(step))
                    if len(steps) > MAX_EPISODE_LENGTH:
                        break
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} num steps: {len(steps)}")
                yield data_dict
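

if __name__ == "__main__":
    # Minimal driver sketch (assumption: in normal use this generator is consumed by a
    # replay-buffer builder elsewhere; here we only iterate it and report episode sizes).
    for episode in convert_dataset_image():
        steps = episode["steps"]
        if not steps:
            continue
        first = steps[0]
        print(
            f"episode: {len(steps)} steps, "
            f"image {first['observation']['image'].shape}, "
            f"state {first['observation']['state'].shape}, "
            f"action {first['action'].shape}"
        )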