# Spaces: liruiw / hma · Runtime error · hma · File size: 7,947 Bytes · 246c106

# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
import os
from typing import Iterable

import numpy as np
from tqdm import tqdm
from collections import OrderedDict
import os
import numpy as np
from pathlib import Path

# Directory portion of this module's ``__file__`` path.
CURRENT_DIR = os.path.dirname(__file__)
import cv2
from os.path import expanduser
import json
import matplotlib.pyplot as plt

# Image resolution constant (not referenced by the functions visible in this file).
RESOLUTION = (480, 480)
# Home directory of the current user.
home = expanduser("~")

# Adjust these to wherever your Ego4D annotations and videos are stored.
ROOT = "/datasets01/ego4d_track2/"
# JSON annotation file loaded by the dataset functions below.
LABEL_ROOT = f"{ROOT}v2_1/annotations/fho_main.json"
# Directory prefix; "<video_uid>.mp4" is appended to it to locate a video.
VIDEO_PATH = f"{ROOT}v2_1/full_scale/"
# from epic_kitchens.hoa import load_detections

# labels = json.load(open("/datasets01/ego4d_track2/v2_1/annotations/fho_main.json"))
# videos = /datasets01/ego4d_track2/v2_1/clips
def parse_video_frame(video_path, frame_id):
    """Decode a single frame from a video file.

    Args:
        video_path: path of the video, handed to ``cv2.VideoCapture``.
        frame_id: frame number to fetch. The capture is positioned at
            ``frame_id - 1`` before reading (presumably because the
            annotations use 1-based frame numbers -- confirm).

    Returns:
        The frame produced by ``cap.read()``. The read status flag is not
        checked, so a failed read yields whatever OpenCV returns for the
        frame in that case (``None`` in the Python bindings).
    """
    cap = cv2.VideoCapture(video_path)
    try:
        cap.set(cv2.CAP_PROP_POS_FRAMES, frame_id - 1)
        ret, frame = cap.read()
    finally:
        # This function is called once per frame, so an unreleased capture
        # would leak one decoder/file handle per call.
        cap.release()
    return frame

def parse_raw_video(video_path):
    """Read every frame of a video into a list.

    NOTE: a second function with the same name is defined further down in
    this module and replaces this one at import time.

    Args:
        video_path: path of the video, handed to ``cv2.VideoCapture``.

    Returns:
        List of frames in read order; empty if the capture could not be
        opened or the first read fails.
    """
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Always free the underlying decoder/file handle.
        cap.release()
    return frames

def _hand_center(boxes, object_type, img_width, img_height):
    """Return the normalized (x, y) center of the last ``object_type`` box, or None."""
    center = None
    for box in boxes:
        if box['object_type'] == object_type:
            bbox = box['bbox']
            center = np.array(
                [
                    (bbox['x'] + bbox['width'] / 2) / img_width,
                    (bbox['y'] + bbox['height'] / 2) / img_height,
                ]
            )
    return center


def compute_state_and_actions(
    image,
    curr_frame,
    next_frame,
    frame_idx,
    save=False,
    save_dir="/private/home/xinleic/LR/hpt_video/data/ego4d_video_label_check",
):
    """Build a (state, action) pair from the hand boxes of two consecutive frames.

    The state is the normalized centers of the left and right hand in the
    current frame; the action is the same quantity for the next frame. Centers
    are divided by the image width/height taken from ``image.shape``.

    Args:
        image: image array for the current frame; only ``shape[0]`` (height)
            and ``shape[1]`` (width) are read, unless ``save`` is set.
        curr_frame: list of box labels for the current frame. Each label is a
            dict with an ``'object_type'`` key and a ``'bbox'`` dict holding
            ``'x'``, ``'y'``, ``'width'`` and ``'height'``.
        next_frame: list of box labels for the next frame, same layout.
        frame_idx: index used only to name the debug image when ``save`` is set.
        save: when true, draw the four centers onto ``image`` (in place) and
            write it to ``save_dir``.
        save_dir: directory for the debug image; defaults to the location that
            was previously hard-coded.

    Returns:
        ``(state, action)``: two arrays of length 4 laid out as
        ``[left_x, left_y, right_x, right_y]``.

    Raises:
        ValueError: if either frame lacks a ``'left_hand'`` or ``'right_hand'``
            box (previously this surfaced as an ``UnboundLocalError``).
    """
    img_width, img_height = image.shape[1], image.shape[0]

    centers = {}
    for frame_name, boxes in (("curr", curr_frame), ("next", next_frame)):
        for hand in ("left_hand", "right_hand"):
            center = _hand_center(boxes, hand, img_width, img_height)
            if center is None:
                raise ValueError(f"no '{hand}' box in {frame_name} frame")
            centers[frame_name, hand] = center

    curr_hand1_center = centers["curr", "left_hand"]
    curr_hand2_center = centers["curr", "right_hand"]
    next_hand1_center = centers["next", "left_hand"]
    next_hand2_center = centers["next", "right_hand"]

    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))

    if save:
        # Current-frame centers use color (0, 255, 0), next-frame centers
        # (0, 0, 255). Note that this draws on the caller's image in place.
        for center, color in (
            (curr_hand1_center, (0, 255, 0)),
            (curr_hand2_center, (0, 255, 0)),
            (next_hand1_center, (0, 0, 255)),
            (next_hand2_center, (0, 0, 255)),
        ):
            pixel = (int(center[0] * img_width), int(center[1] * img_height))
            cv2.circle(image, pixel, 10, color, -1)
        cv2.imwrite(f"{save_dir}/img_{frame_idx}.png", image)
    return state, action


def parse_raw_video(video_path):
    """Read every frame of a video into a list.

    Args:
        video_path: path of the video, handed to ``cv2.VideoCapture``.

    Returns:
        List of frames in read order; empty if the capture could not be
        opened or the first read fails.
    """
    import cv2
    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            frames.append(frame)
    finally:
        # Always free the underlying decoder/file handle.
        cap.release()
    return frames

def chunk_actions_and_concatenate(actions, chunk_size=4):
    """Group actions into fixed-size chunks and merge each chunk's frames.

    Args:
        actions: sequence of dicts, each with a ``'frames'`` entry that is
            either an iterable of frames or ``None``.
        chunk_size: number of consecutive actions per chunk (default 4, the
            value that used to be hard-coded).

    Returns:
        One list per kept chunk, holding the frames of that chunk's actions in
        order. Actions whose ``'frames'`` is ``None`` contribute nothing.

    NOTE(review): the last chunk is always dropped, even when it is complete
    (so e.g. exactly ``chunk_size`` actions yield no output). This matches the
    previous behaviour; confirm whether only partial chunks should be dropped.
    """
    chunked_actions = [
        actions[start:start + chunk_size]
        for start in range(0, len(actions), chunk_size)
    ][:-1]

    concatenated_frames = []
    for chunk in chunked_actions:
        merged = []
        for action in chunk:
            frames = action['frames']
            if frames is not None:
                merged.extend(frames)
        concatenated_frames.append(merged)
    return concatenated_frames


def ego4d_dataset_size() -> int:
    """ Returns the number of trajectories in the dataset. ~1725 for Ego4D.

    The count is the length of the ``'videos'`` list in the annotation file
    at ``LABEL_ROOT``; the whole JSON file is parsed on every call.
    """
    # Context manager so the annotation file handle is closed after parsing.
    with open(LABEL_ROOT) as label_file:
        labels = json.load(label_file)
    return len(labels['videos'])


# define your own dataset conversion
def ego4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego4D.

    For every selected video, each annotated interval's valid narrated actions
    are grouped by ``chunk_actions_and_concatenate``; every resulting frame
    window becomes one candidate trajectory. Consecutive frame pairs that both
    carry more than two boxes are turned into steps via
    ``compute_state_and_actions``.

    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.

    Yields:
        ``{"steps": [...]}`` where each step is an ``OrderedDict`` with
        ``"observation"`` (``"image"`` and ``"state"``), ``"action"`` and
        ``"language_instruction"``. Windows producing fewer than 16 steps are
        skipped, as are videos whose file does not exist.
    """
    # Context manager so the annotation file handle is closed after parsing.
    with open(LABEL_ROOT) as label_file:
        labels = json.load(label_file)

    if example_inds is None:
        example_inds = range(len(labels['videos']))

    # Placeholder instruction shared by every step.
    lang = "use human hands to do some tasks"  # dummies

    for example_ind in example_inds:
        label = labels['videos'][example_ind]
        video_path = VIDEO_PATH + label['video_uid'] + ".mp4"
        if not os.path.exists(video_path):
            print("skip", video_path)
            continue

        print("video_path:", video_path)
        # NOTE(review): this is the length of the top-level annotation dict,
        # not a per-video detection count; kept for identical log output.
        print("len label detections", len(labels))

        # action extractions over bounding boxes subtractions of both hands.
        for interval in label['annotated_intervals']:
            print(f"Interval [{interval['start_sec']} - {interval['end_sec']}]")
            actions = [
                narrated
                for narrated in interval['narrated_actions']
                if not (narrated['is_invalid_annotation'] or narrated['is_rejected'])
                and narrated['stage'] is not None
            ]
            print(f"Actions: {len(actions)}")

            # because we need to concatenate
            if len(actions) < 3:
                continue

            # the number of frames is usually 7 and it also does not follow strict 2hz
            chunk_actions = chunk_actions_and_concatenate(actions)
            for frame_idx, frames in enumerate(chunk_actions):
                steps = []
                for idx, frame in enumerate(frames[:-1]):
                    next_frame = frames[idx + 1]

                    # Check the labels first so frames that would be skipped
                    # anyway are never decoded from the video.
                    if len(frame['boxes']) <= 2 or len(next_frame['boxes']) <= 2:
                        continue

                    image = parse_video_frame(video_path, frame['frame_number'])
                    try:
                        s, a = compute_state_and_actions(image, frame['boxes'], next_frame['boxes'], idx, save=False)
                    except Exception as err:
                        # Best effort: a missing hand box or an unreadable
                        # frame only drops this step. Unlike a bare except,
                        # KeyboardInterrupt/SystemExit still propagate.
                        print(f'compute action failed idx {idx} frame idx {frame_idx}: {err!r}')
                        continue

                    # break into step dict
                    step = {
                        "observation": {"image": image, "state": s},
                        "action": a,
                        "language_instruction": lang,
                    }
                    steps.append(OrderedDict(step))

                if len(steps) < 16:
                    print("skip this traj because frame window length < 16")
                    continue
                data_dict = {"steps": steps}
                yield data_dict