# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
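"""Convert Ego-Exo4D hand-pose annotations into (state, action) episodes.

For each take, 2D wrist keypoints from the automatic ego-pose hand annotations
are normalized by image size: the current frame's wrists form the state and the
next frame's wrists form the action. Continuously annotated chunks are yielded
as episodes of step dicts (image observation, state, action, language instruction).
"""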
import json
import os
from collections import OrderedDict
from typing import Iterable

import cv2
import numpy as np


# Adjust these to wherever your annotations and frames are stored.
CAM = "cam01"
ROOT = "/datasets01/egoexo4d/v2/"
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
TAKE_ROOT = ROOT + "takes.json"



def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
    """Build a (state, action) pair from the 2D wrist keypoints of the current and next frames.

    The state is the current frame's normalized left/right wrist positions;
    the action is the next frame's normalized wrist positions.
    """
    img_width, img_height = image.shape[1], image.shape[0]

    # wrist keypoints are given in pixel coordinates
    curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
    curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']

    # normalize them to [0, 1]
    curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
    curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])

    next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
    next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']

    # normalize them to [0, 1]
    next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
    next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])

    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))
    if save:
        # draw the current and next wrist keypoints
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # save the visualization
        cv2.imwrite(f"output/inspect/test_{idx}.png", image)
    return state, action


def parse_raw_video(video_path):
    """Decode every frame of a video into a list of BGR numpy arrays."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def egoexo4d_dataset_size() -> int:
    """Return the number of takes in the dataset (~5k for v2)."""
    with open(TAKE_ROOT) as f:
        takes = json.load(f)
    return len(takes)


# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.
    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # convert each take into an episode of steps that can be added to a replay buffer
    MAX_EPISODE_LENGTH = 5000
    with open(TAKE_ROOT) as f:
        TAKE_FILE = json.load(f)
    print("total takes", len(TAKE_FILE))
    # find the first camera with aria
    global CAM

    def find_aria_name(take):
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    if example_inds is None:
        example_inds = range(len(TAKE_FILE))

    for example_ind in example_inds:
        take = TAKE_FILE[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue

        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)

        if not os.path.exists(video_path) or not os.path.exists(label_path):
            continue

        video_frames = parse_raw_video(video_path)
        label_detections = json.load(open(label_path))
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        # action extractions over bounding boxes subtractions of both hands.
        max_frame_idx = len(video_frames) - 1
        DS_FACTOR = 1
        frame_idx = 0
        start_frame_idx = 0
        MIN_CLIP_LENGTH = 300

        def get_continuous_chunk(start_idx, label_detections):
            # walk forward while consecutive frames have non-empty hand annotations
            def has_hands(idx):
                return str(idx) in label_detections and len(label_detections[str(idx)]) > 0
            end_idx = start_idx + 1
            while has_hands(start_idx) and has_hands(end_idx):
                end_idx += 1
            return end_idx

        print("TAKE", take_name)

        # some frames might not have labels; if there is a gap, skip ahead
        while start_frame_idx < max_frame_idx - DS_FACTOR:
            lang = "use human hands to do some tasks"  # dummy language instruction
            if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections:
                start_frame_idx += DS_FACTOR
                continue

            end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
            # print("start_frame_idx", start_frame_idx, end_frame_idx)

            if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
                start_frame_idx = end_frame_idx
                continue

            print("start clipping from", start_frame_idx, "to", end_frame_idx)
            steps = []
            for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
                image = video_frames[frame_idx][..., [2, 1, 0]]  # convert BGR (OpenCV) to RGB
                try:
                    s, a = compute_state_and_actions(
                        image,
                        label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)],
                        frame_idx, save=False
                    )
                except Exception:
                    # e.g. missing wrist annotation for this camera; end the clip here
                    break
                # pack into a step dict
                step = {
                    "observation": {"image": image, "state": s},
                    "action": a,
                    "language_instruction": lang,
                }
                steps.append(OrderedDict(step))
                if len(steps) > MAX_EPISODE_LENGTH:
                    break

            start_frame_idx = end_frame_idx
            # only yield clips that are long enough
            if len(steps) >= MIN_CLIP_LENGTH:
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} num steps: {len(steps)}")
                yield data_dict
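

# Minimal usage sketch (an assumption, not part of the original pipeline):
# iterate over the first few takes and inspect the shape of each episode's steps.
if __name__ == "__main__":
    for episode in egoexo4d_dataset_generator(example_inds=range(3)):
        steps = episode["steps"]
        if not steps:
            continue
        first = steps[0]
        print(
            f"episode with {len(steps)} steps, "
            f"state shape {first['observation']['state'].shape}, "
            f"action shape {first['action'].shape}"
        )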