# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path

import cv2
import numpy as np

from epic_kitchens.hoa import load_detections
from epic_kitchens.hoa.types import HandSide

CURRENT_DIR = os.path.dirname(__file__)

RESOLUTION = (480, 480)
home = expanduser("~")

# Adjust these to wherever your detections and frames are stored.
DETECTION_ROOT = "/checkpoint/xinleic/LR/epic-kitchens-100-hand-object-bboxes/labels/hand-objects"
FRAMES_ROOT = "/datasets01/EPIC-KITCHENS-100"

# DETECTION_ROOT = f'{home}/Projects/epic_kitchen_labels/hand-objects'
# FRAMES_ROOT = f'{home}/EPIC-KITCHENS'
detections_root = Path(DETECTION_ROOT)
frames_root = Path(FRAMES_ROOT)
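
# Expected on-disk layout (an assumption, inferred from how paths are built in
# convert_dataset_image below):
#   DETECTION_ROOT/<participant_id>/<video_id>.pkl
#   FRAMES_ROOT/<participant_id>/rgb_frames/<video_id>/frame_<10-digit index>.jpg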


def compute_state_and_actions(curr_frame, next_frame):
    """State: concatenated (left, right) hand bbox centers of the current frame.
    Action: per-hand displacement of those centers from curr_frame to next_frame."""
    curr_hand1, curr_hand2 = curr_frame.hands[0], curr_frame.hands[1]
    if curr_hand1.side != HandSide.LEFT:  # flip so hand1 is always the left hand
        curr_hand1, curr_hand2 = curr_hand2, curr_hand1

    # bbox centers are already normalized
    curr_hand1_center = curr_hand1.bbox.center
    curr_hand2_center = curr_hand2.bbox.center

    next_hand1, next_hand2 = next_frame.hands[0], next_frame.hands[1]
    if next_hand1.side != HandSide.LEFT:  # flip so hand1 is always the left hand
        next_hand1, next_hand2 = next_hand2, next_hand1

    # bbox centers are already normalized
    next_hand1_center = next_hand1.bbox.center
    next_hand2_center = next_hand2.bbox.center
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate(
        (
            np.array(next_hand1_center) - np.array(curr_hand1_center),
            np.array(next_hand2_center) - np.array(curr_hand2_center),
        )
    )
    return state, action
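

# Minimal self-check sketch (an addition, not part of the original pipeline): it builds
# lightweight stand-ins for the detection objects -- only the attributes that
# compute_state_and_actions touches -- to show that the state is the two hand centers and
# the action is their per-hand displacement. The stand-ins are not the epic_kitchens API.
def _demo_compute_state_and_actions():
    from types import SimpleNamespace

    def fake_frame(left_center, right_center):
        def hand(side, center):
            return SimpleNamespace(side=side, bbox=SimpleNamespace(center=np.array(center)))

        # hands are deliberately listed right-first to exercise the flip branch above
        return SimpleNamespace(hands=[hand(HandSide.RIGHT, right_center), hand(HandSide.LEFT, left_center)])

    curr = fake_frame(left_center=(0.20, 0.50), right_center=(0.70, 0.50))
    nxt = fake_frame(left_center=(0.25, 0.50), right_center=(0.65, 0.45))
    state, action = compute_state_and_actions(curr, nxt)
    print("state:", state)    # approximately [0.2, 0.5, 0.7, 0.5]
    print("action:", action)  # approximately [0.05, 0.0, -0.05, -0.05]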


# define your own dataset conversion
def convert_dataset_image():
    # Yield episode dicts (one per video) that can be added to a replay buffer.
    ALL_EPISODES = os.listdir(FRAMES_ROOT)  # participant directories, e.g. P01
    MAX_EPISODE_LENGTH = 5000

    for EPS in ALL_EPISODES:
        rgb_path = os.path.join(FRAMES_ROOT, EPS, "rgb_frames")
        if not os.path.exists(rgb_path):
            continue
        for video_id in os.listdir(rgb_path):
            full_path = os.path.join(rgb_path, video_id)
            if not full_path.endswith((".tar", ".jpg", "home")):  # keep only extracted frame folders

                # Actions are the frame-to-frame displacement of both hands' bounding-box centers.
                participant_id = video_id[:3]
                video_detections = load_detections(detections_root / participant_id / (video_id + ".pkl"))
                max_frame_idx = len(video_detections) - 1
                DS_FACTOR = 1
                print(full_path)
                steps = []

                for frame_idx in range(0, max_frame_idx - DS_FACTOR, DS_FACTOR):
                    # print(video_detections[frame_idx].hands)
                    if (
                        len(video_detections[frame_idx].hands) != 2
                        or len(video_detections[frame_idx + DS_FACTOR].hands) != 2
                    ):
                        continue

                    s, a = compute_state_and_actions(
                        video_detections[frame_idx], video_detections[frame_idx + DS_FACTOR]
                    )
                    lang = "use human hands to do some tasks"  # dummy language instruction
                    # print("state actions:", s, a)
                    image_path = frames_root / participant_id / "rgb_frames" / video_id / f"frame_{frame_idx:010d}.jpg"
                    # print(image_path)
                    image = cv2.imread(str(image_path))
                    if image is None:
                        continue
                    image = image[..., [2, 1, 0]]  # BGR -> RGB

                    # break into step dict
                    step = {
                        "observation": {"image": image, "state": s},
                        "action": a,
                        "language_instruction": lang,
                    }
                    steps.append(OrderedDict(step))
                    if len(steps) > MAX_EPISODE_LENGTH:
                        break
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
                yield data_dict
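

# Usage sketch (an assumption about how this module is driven; the original file only
# defines the generator): iterate over the yielded episodes, e.g. to count steps before
# adding them to a replay buffer.
if __name__ == "__main__":
    for episode_idx, episode in enumerate(convert_dataset_image()):
        print(f"episode {episode_idx}: {len(episode['steps'])} steps")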