# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path

import cv2
import numpy as np

from epic_kitchens.hoa import load_detections
from epic_kitchens.hoa.types import HandSide

CURRENT_DIR = os.path.dirname(__file__)
RESOLUTION = (480, 480)
home = expanduser("~")
# Adjust these to wherever your detections and frames are stored.
DETECTION_ROOT = "/checkpoint/xinleic/LR/epic-kitchens-100-hand-object-bboxes/labels/hand-objects"
FRAMES_ROOT = "/datasets01/EPIC-KITCHENS-100"
# DETECTION_ROOT = f'{home}/Projects/epic_kitchen_labels/hand-objects'
# FRAMES_ROOT = f'{home}/EPIC-KITCHENS'
detections_root = Path(DETECTION_ROOT)
frames_root = Path(FRAMES_ROOT)
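# Expected layout (inferred from the loops below; adjust if your copy differs):
#   DETECTION_ROOT/<participant_id>/<video_id>.pkl
#   FRAMES_ROOT/<participant_id>/rgb_frames/<video_id>/frame_XXXXXXXXXX.jpg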
def compute_state_and_actions(curr_frame, next_frame):
    """Build a (state, action) pair from two consecutive hand detections.

    The state is the concatenated (left, right) hand-box centers of the
    current frame; the action is the per-hand displacement of those centers
    between the current and next frame. Centers are already normalized.
    """
    curr_hand1, curr_hand2 = curr_frame.hands[0], curr_frame.hands[1]
    if curr_hand1.side != HandSide.LEFT:  # flip to (left, right) ordering
        curr_hand1, curr_hand2 = curr_hand2, curr_hand1
    # already normalized
    curr_hand1_center = curr_hand1.bbox.center
    curr_hand2_center = curr_hand2.bbox.center

    next_hand1, next_hand2 = next_frame.hands[0], next_frame.hands[1]
    if next_hand1.side != HandSide.LEFT:  # flip to (left, right) ordering
        next_hand1, next_hand2 = next_hand2, next_hand1
    # already normalized
    next_hand1_center = next_hand1.bbox.center
    next_hand2_center = next_hand2.bbox.center

    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate(
        (
            np.array(next_hand1_center) - np.array(curr_hand1_center),
            np.array(next_hand2_center) - np.array(curr_hand2_center),
        )
    )
    return state, action
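

# Shape sketch (illustrative values only, not taken from a real detection):
# with two tracked hands, `state` is a length-4 array
# [left_x, left_y, right_x, right_y] and `action` is the length-4 per-hand
# center delta to the next frame, e.g.
#   state  = [0.31, 0.62, 0.70, 0.58]
#   action = [0.02, -0.01, -0.03, 0.00]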


# Define your own dataset conversion.
def convert_dataset_image():
    # Convert to a stream of episodes that can be added to a replay buffer.
    ALL_EPISODES = os.listdir(FRAMES_ROOT)
    MAX_EPISODE_LENGTH = 5000
    for EPS in ALL_EPISODES:
        rgb_path = os.path.join(FRAMES_ROOT, EPS, "rgb_frames")
        if not os.path.exists(rgb_path):
            continue
        for video_id in os.listdir(rgb_path):
            full_path = os.path.join(rgb_path, video_id)
            if (
                not full_path.endswith(".tar") and not full_path.endswith(".jpg") and not full_path.endswith("home")
            ):  # only descend into frame folders
                # Extract actions as bounding-box center differences of both hands.
                participant_id = video_id[:3]
                video_detections = load_detections(detections_root / participant_id / (video_id + ".pkl"))
                max_frame_idx = len(video_detections) - 1
                DS_FACTOR = 1  # temporal downsampling factor
                print(full_path)
                steps = []
                for frame_idx in range(0, max_frame_idx - DS_FACTOR, DS_FACTOR):
                    # Skip frames where both hands are not detected in both the
                    # current and the next (downsampled) frame.
                    if (
                        len(video_detections[frame_idx].hands) != 2
                        or len(video_detections[frame_idx + DS_FACTOR].hands) != 2
                    ):
                        continue
                    s, a = compute_state_and_actions(
                        video_detections[frame_idx], video_detections[frame_idx + DS_FACTOR]
                    )
                    lang = "use human hands to do some tasks"  # dummy instruction
                    image_path = frames_root / participant_id / "rgb_frames" / video_id / f"frame_{frame_idx:010d}.jpg"
                    image = cv2.imread(str(image_path))
                    if image is None:
                        continue
                    image = image[..., [2, 1, 0]]  # BGR -> RGB
                    # Pack into a step dict.
                    step = {
                        "observation": {"image": image, "state": s},
                        "action": a,
                        "language_instruction": lang,
                    }
                    steps.append(OrderedDict(step))
                    if len(steps) > MAX_EPISODE_LENGTH:
                        break
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds_factor: {DS_FACTOR} steps: {len(steps)}")
                yield data_dict
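

# Minimal sketch of how this generator might be consumed for a local sanity
# check; it assumes DETECTION_ROOT and FRAMES_ROOT above point at valid copies
# of the EPIC-KITCHENS-100 frames and hand-object labels.
if __name__ == "__main__":
    for episode_idx, episode in enumerate(convert_dataset_image()):
        steps = episode["steps"]
        if steps:
            first = steps[0]
            print(
                f"episode {episode_idx}: {len(steps)} steps, "
                f"image shape {first['observation']['image'].shape}, "
                f"state {first['observation']['state']}, action {first['action']}"
            )
        if episode_idx >= 2:  # only peek at the first few episodes
            break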