Spaces:

liruiw
/

hma

Running on Zero

File size: 7,218 Bytes

246c106

# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import os
from typing import Iterable

import numpy as np
from tqdm import tqdm
from collections import OrderedDict
import os
import numpy as np
from pathlib import Path


CURRENT_DIR = os.path.dirname(__file__)
import cv2
from os.path import expanduser
import json


# Adjust these to the where-ever your detections and frames are stored.
CAM = "cam01" # cam01
ROOT = "/datasets01/egoexo4d/v2/"
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
# from epic_kitchens.hoa import load_detections
TAKE_ROOT = ROOT + "takes.json"



def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
    img_width, img_height = image.shape[1], image.shape[0]

    # already normalized
    curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
    curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']

    # normalized them
    curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
    curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])

    next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
    next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']

    # normalize them
    next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
    next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])


    state = np.concatenate((curr_hand1_center, curr_hand2_center)) #  - np.array(curr_hand1_center)  - np.array(curr_hand2_center)
    action = np.concatenate(
        (
            np.array(next_hand1_center),
            np.array(next_hand2_center),
        )
    )
    if save:
        # draw the bounding boxes
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # save the image
        cv2.imwrite(f"output/inspect/test_{idx}.png", image)
    return state, action


def parse_raw_video(video_path):
    import cv2
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    return frames

def egoexo4d_dataset_size() -> int:
    """ Returns the number of takes in the dataset. ~5k for v2. """
    takes = json.load(open(TAKE_ROOT))
    return len(takes)


# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.
    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # convert to a list of episodes that can be added to replay buffer
    MAX_EPISODE_LENGTH = 5000
    TAKE_FILE = json.load(open(TAKE_ROOT))
    print("total takes", len(TAKE_FILE))
    # find the first camera with aria
    global CAM

    def find_aria_name(take):
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    if example_inds is None:
        example_inds = range(len(TAKE_FILE))

    for example_ind in example_inds:
        take = TAKE_FILE[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue

        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)

        if not os.path.exists(video_path) or not os.path.exists(label_path):
            continue

        video_frames = parse_raw_video(video_path)
        label_detections = json.load(open(label_path))
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        # action extractions over bounding boxes subtractions of both hands.
        max_frame_idx = len(video_frames) - 1
        DS_FACTOR = 1
        frame_idx = 0
        start_frame_idx = 0
        MIN_CLIP_LENGTH = 300

        def get_continuous_chunk(start_idx, label_detections):
            end_idx = start_idx + 1
            while  str(start_idx) in label_detections  and len(label_detections[str(start_idx)]) > 0 and str(end_idx) in label_detections and len(label_detections[str(end_idx)]) > 0:
                end_idx += 1
            return end_idx

        print("TAKE", take_name)

        # some frames might not have label. if there is a gap, skip
        while start_frame_idx < max_frame_idx - DS_FACTOR:
            # print(video_detections[frame_idx].hands)
            lang = "use human hands to do some tasks"  # dummies
            if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections:
                start_frame_idx += DS_FACTOR
                continue

            end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
            # print("start_frame_idx", start_frame_idx, end_frame_idx)

            if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
                start_frame_idx = end_frame_idx
                continue

            print("start clipping from", start_frame_idx, "to", end_frame_idx)
            steps = []
            for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
                image = video_frames[frame_idx][...,[2,1,0]] # RGB
                try:
                    s, a = compute_state_and_actions(
                        image,
                        label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)],
                        frame_idx, save=False
                    )
                except:
                    break
                # break into step dict
                step = {
                    "observation": {"image": image, "state": s},
                    "action": a,
                    "language_instruction": lang,
                }
                steps.append(OrderedDict(step))
                if len(steps) > MAX_EPISODE_LENGTH:
                    break

            start_frame_idx = end_frame_idx
            if len(steps) < MIN_CLIP_LENGTH:
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
                yield data_dict