# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import json
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path
from typing import Iterable

import cv2
import matplotlib.pyplot as plt
import numpy as np
from tqdm import tqdm

CURRENT_DIR = os.path.dirname(__file__)

RESOLUTION = (480, 480)
home = expanduser("~")

# Adjust these to wherever your detections and frames are stored.
ROOT = "/datasets01/ego4d_track2/"
LABEL_ROOT = ROOT + "v2_1/annotations/fho_main.json"
VIDEO_PATH = ROOT + "v2_1/full_scale/"
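
# Expected on-disk layout implied by the paths above (assumption; adjust to your
# own Ego4D download):
#   <ROOT>/v2_1/annotations/fho_main.json   -> LABEL_ROOT
#   <ROOT>/v2_1/full_scale/<video_uid>.mp4  -> videos read by the generator below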
# from epic_kitchens.hoa import load_detections

# labels = json.load(open("/datasets01/ego4d_track2/v2_1/annotations/fho_main.json"))
# videos = /datasets01/ego4d_track2/v2_1/clips
def parse_video_frame(video_path, frame_id):
    """Read a single (1-indexed) frame from a video file."""
    cap = cv2.VideoCapture(video_path)
    cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id - 1)
    ret, frame = cap.read()
    cap.release()
    return frame

def parse_raw_video(video_path):
    """Decode every frame of a video into a list of BGR arrays."""
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def compute_state_and_actions(image, curr_frame, next_frame, frame_idx, save=False):
    """Compute a normalized hand-center state/action pair from two annotated frames.

    curr_frame and next_frame are lists of bounding-box labels. If either hand is
    missing from a frame, the corresponding local stays unbound and the resulting
    UnboundLocalError is caught by the caller, which skips that step.
    """
    img_width, img_height = image.shape[1], image.shape[0]
    for box in curr_frame:
        if box['object_type'] == 'left_hand':
            curr_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            curr_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]

    for box in next_frame:
        if box['object_type'] == 'left_hand':
            next_hand1_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]
        if box['object_type'] == 'right_hand':
            next_hand2_center = [box['bbox']['x'] + box['bbox']['width'] / 2, box['bbox']['y'] + box['bbox']['height'] / 2]

    # normalize the hand centers to [0, 1] image coordinates
    curr_hand1_center = np.array([curr_hand1_center[0] / img_width, curr_hand1_center[1] / img_height])
    curr_hand2_center = np.array([curr_hand2_center[0] / img_width, curr_hand2_center[1] / img_height])
    next_hand1_center = np.array([next_hand1_center[0] / img_width, next_hand1_center[1] / img_height])
    next_hand2_center = np.array([next_hand2_center[0] / img_width, next_hand2_center[1] / img_height])

    # state: current hand centers; action: hand centers at the next annotated frame
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))

    if save:
        # draw the current (green) and next (red) hand centers and save for inspection
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.imwrite(f"/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check/img_{frame_idx}.png", image)
    return state, action
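
# Hedged sketch of the box format assumed by compute_state_and_actions (field
# names follow how the boxes are indexed above; the numbers are illustrative only):
_EXAMPLE_BOXES = [
    {"object_type": "left_hand", "bbox": {"x": 100.0, "y": 200.0, "width": 50.0, "height": 40.0}},
    {"object_type": "right_hand", "bbox": {"x": 300.0, "y": 210.0, "width": 55.0, "height": 42.0}},
]
# With one such box per hand in curr_frame and next_frame, the returned state and
# action are each 4-dim arrays (left_x, left_y, right_x, right_y), normalized by
# the image width and height.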


def chunk_actions_and_concatenate(actions):
    """Group narrated actions into chunks of 4 and pool their annotated frames.

    The last chunk from the comprehension is always dropped, whether or not it is
    a full chunk of 4.
    """
    chunk_size = 4
    chunked_actions = [actions[i:i + chunk_size] for i in range(0, len(actions), chunk_size)][:-1]
    concatenated_frames = []

    for chunk in chunked_actions:
        frames_to_concat = []
        for action in chunk:
            frames = action['frames']  # list of annotated frames for this action, or None
            if frames is not None:
                frames_to_concat.extend(frames)
        concatenated_frames.append(frames_to_concat)  # pooled frames for this chunk

    return concatenated_frames
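
# Worked example (hypothetical action dicts with integer stand-ins for frames):
#   >>> acts = [{"frames": [i]} for i in range(9)]
#   >>> [len(c) for c in chunk_actions_and_concatenate(acts)]
#   [4, 4]
# The trailing partial chunk (index 8) is dropped by the [:-1] above.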


def ego4d_dataset_size() -> int:
    """Returns the number of trajectories in the dataset. ~1725 for Ego4D."""
    with open(LABEL_ROOT) as f:
        labels = json.load(f)
    return len(labels['videos'])


# define your own dataset conversion
def ego4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego4D.
    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # convert to a list of episodes that can be added to the replay buffer
    with open(LABEL_ROOT) as f:
        labels = json.load(f)

    if example_inds is None:
        example_inds = range(len(labels['videos']))

    for example_ind in example_inds:
        label = labels['videos'][example_ind]
        # ['annotated_intervals'][2]['narrated_actions']
        video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
        if not os.path.exists(video_path):
            print("skip", video_path)
            continue

        label_detections = labels  # full annotation dict; the print below counts its top-level keys
        print("video_path:", video_path)
        print("len label detections", len(label_detections))

        # action extractions over bounding boxes subtractions of both hands.
        for interval in label['annotated_intervals']:
            # print(video_detections[frame_idx].hands)

            lang = "use human hands to do some tasks"  # dummies
            # import IPython; IPython.embed()
            print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
            actions = list(filter(lambda x: not (x['is_invalid_annotation'] or x['is_rejected']) and x['stage'] is not None, interval['narrated_actions']))
            print(f"Actions: {len(actions)}")

            # skip intervals with too few actions to chunk and concatenate
            if len(actions) < 3:
                continue

            # each action usually comes with ~7 annotated frames, and they do not follow a strict 2 Hz sampling rate
            chunk_actions = chunk_actions_and_concatenate(actions)
            for frame_idx, frames in enumerate(chunk_actions):
                # lang = frame['narration_text']
                steps = []
                # need to use dummy actions to expand from 6 frames to 16 frames
                for idx, frame in enumerate(frames[:-1]):
                    frame_id = frame['frame_number']
                    next_frame = frames[idx + 1]
                    image = parse_video_frame(video_path, frame_id)

                    if len(frame['boxes']) > 2 and len(next_frame['boxes']) > 2:
                        try:
                            s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
                        except Exception:
                            print(f'compute action failed idx {idx} frame idx {frame_idx}')
                            continue
                        # break into step dict
                        step = {
                            "observation": {"image": image, "state": s},
                            "action": a,
                            "language_instruction": lang,
                        }
                        steps.append(OrderedDict(step))

                if len(steps) < 16:
                    print("skip this traj because frame window length < 16")
                    continue
                data_dict = {"steps": steps}
                yield data_dict
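

# Hedged usage sketch (not part of the original pipeline): pull a couple of
# trajectories and print their shapes. Assumes LABEL_ROOT and VIDEO_PATH above
# point at a local Ego4D v2.1 download.
if __name__ == "__main__":
    for traj_idx, traj in enumerate(ego4d_dataset_generator(example_inds=range(2))):
        steps = traj["steps"]
        print(f"trajectory {traj_idx}: {len(steps)} steps")
        first_step = steps[0]
        print("  image shape:", first_step["observation"]["image"].shape)
        print("  state:", first_step["observation"]["state"])
        print("  action:", first_step["action"])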