# hma/datasets/extern/egoexo4d.py
# --------------------------------------------------------
# Licensed under The MIT License [see LICENSE for details]
# --------------------------------------------------------
# https://github.com/epic-kitchens/epic-kitchens-100-hand-object-bboxes/blob/master/notebooks/demo.ipynb
import json
import os
from collections import OrderedDict
from os.path import expanduser
from pathlib import Path
from typing import Iterable

import cv2
import numpy as np
from tqdm import tqdm

CURRENT_DIR = os.path.dirname(__file__)

# Adjust these to wherever your detections and frames are stored.
CAM = "cam01"  # camera whose video stream and 2D wrist annotations are used
ROOT = "/datasets01/egoexo4d/v2/"
LABEL_ROOT = ROOT + "annotations/ego_pose/train/hand/automatic/{}.json"
VIDEO_PATH = ROOT + "takes/{}/frame_aligned_videos/{}.mp4"
# from epic_kitchens.hoa import load_detections
TAKE_ROOT = ROOT + "takes.json"
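# Note: VIDEO_PATH is formatted with (take_name, camera name), LABEL_ROOT with the
# take UID, and TAKE_ROOT lists all takes (see egoexo4d_dataset_generator below).
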
def compute_state_and_actions(image, curr_frame, next_frame, idx, save=False):
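    """Build one (state, action) pair from two labeled frames.

    The state is the normalized 2D position of both wrists in the current frame;
    the action is their normalized position in the next frame. If `save` is set,
    an image with the keypoints drawn is written to output/inspect/.
    """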
    img_width, img_height = image.shape[1], image.shape[0]
    # 2D wrist keypoints are in pixel coordinates; normalize by the image size
    curr_hand1_center = curr_frame[0]['annotation2D'][CAM]['left_wrist']
    curr_hand2_center = curr_frame[0]['annotation2D'][CAM]['right_wrist']
    curr_hand1_center = np.array([curr_hand1_center['x'] / img_width, curr_hand1_center['y'] / img_height])
    curr_hand2_center = np.array([curr_hand2_center['x'] / img_width, curr_hand2_center['y'] / img_height])

    next_hand1_center = next_frame[0]['annotation2D'][CAM]['left_wrist']
    next_hand2_center = next_frame[0]['annotation2D'][CAM]['right_wrist']
    next_hand1_center = np.array([next_hand1_center['x'] / img_width, next_hand1_center['y'] / img_height])
    next_hand2_center = np.array([next_hand2_center['x'] / img_width, next_hand2_center['y'] / img_height])

    # state: current wrist positions; action: next-frame wrist positions
    state = np.concatenate((curr_hand1_center, curr_hand2_center))
    action = np.concatenate((next_hand1_center, next_hand2_center))

    if save:
        # draw the current (green) and next (red) wrist keypoints
        cv2.circle(image, (int(curr_hand1_center[0] * img_width), int(curr_hand1_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(curr_hand2_center[0] * img_width), int(curr_hand2_center[1] * img_height)), 10, (0, 255, 0), -1)
        cv2.circle(image, (int(next_hand1_center[0] * img_width), int(next_hand1_center[1] * img_height)), 10, (0, 0, 255), -1)
        cv2.circle(image, (int(next_hand2_center[0] * img_width), int(next_hand2_center[1] * img_height)), 10, (0, 0, 255), -1)
        # save the visualization for inspection
        os.makedirs("output/inspect", exist_ok=True)
        cv2.imwrite(f"output/inspect/test_{idx}.png", image)
    return state, action

def parse_raw_video(video_path):
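    """Decode every frame of the video at `video_path` into a list of BGR arrays."""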
    cap = cv2.VideoCapture(video_path)
    frames = []
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
        frames.append(frame)
    cap.release()
    return frames

def egoexo4d_dataset_size() -> int:
    """ Returns the number of takes in the dataset. ~5k for v2. """
    takes = json.load(open(TAKE_ROOT))
    return len(takes)
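
# NOTE (illustrative, inferred from the access patterns below): each automatic hand
# label file maps a frame index (as a string) to a list of annotation dicts, e.g.
#   {"42": [{"annotation2D": {"cam01": {"left_wrist":  {"x": ..., "y": ...},
#                                       "right_wrist": {"x": ..., "y": ...}}}}], ...}
# and only the first annotation entry per frame is used.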
# define your own dataset conversion
def egoexo4d_dataset_generator(example_inds: Iterable[int] = None):
    """
    Generator yielding data from Ego-Exo4D.

    Args:
        example_inds: if specified, will only yield data from these indices.
            Otherwise, will default to yielding the entire dataset.
    """
    # convert each take into a list of episodes that can be added to a replay buffer
    MAX_EPISODE_LENGTH = 5000
    TAKE_FILE = json.load(open(TAKE_ROOT))
    print("total takes", len(TAKE_FILE))

    # find the first camera with aria in its name (unused while CAM is hard-coded above)
    global CAM

    def find_aria_name(take):
        for cam in take['cameras']:
            if 'aria' in cam['name']:
                return cam['name']
        return None

    if example_inds is None:
        example_inds = range(len(TAKE_FILE))

    for example_ind in example_inds:
        take = TAKE_FILE[example_ind]
        take_name = take['take_name']
        take_uid = take['take_uid']
        # CAM = find_aria_name(take)
        # if CAM is None:
        #     continue
        video_path = VIDEO_PATH.format(take_name, CAM)
        label_path = LABEL_ROOT.format(take_uid)
        if not os.path.exists(video_path) or not os.path.exists(label_path):
            continue

        video_frames = parse_raw_video(video_path)
        label_detections = json.load(open(label_path))
        print("video_path:", video_path)
        print("len video frames", len(video_frames))
        print("len label detections", len(label_detections))

        # states/actions are extracted from the 2D wrist keypoints of both hands
        max_frame_idx = len(video_frames) - 1
        DS_FACTOR = 1
        frame_idx = 0
        start_frame_idx = 0
        MIN_CLIP_LENGTH = 300

        def get_continuous_chunk(start_idx, label_detections):
            # advance end_idx until a frame without a (non-empty) hand label is hit
            end_idx = start_idx + 1
            while (str(start_idx) in label_detections and len(label_detections[str(start_idx)]) > 0
                   and str(end_idx) in label_detections and len(label_detections[str(end_idx)]) > 0):
                end_idx += 1
            return end_idx

        print("TAKE", take_name)
        # some frames might not have labels; if there is a gap, skip ahead
        while start_frame_idx < max_frame_idx - DS_FACTOR:
            lang = "use human hands to do some tasks"  # dummy language instruction
            if str(start_frame_idx) not in label_detections or str(start_frame_idx + DS_FACTOR) not in label_detections:
                start_frame_idx += DS_FACTOR
                continue
            end_frame_idx = get_continuous_chunk(start_frame_idx, label_detections)
            # print("start_frame_idx", start_frame_idx, end_frame_idx)
            if end_frame_idx - start_frame_idx < MIN_CLIP_LENGTH:
                start_frame_idx = end_frame_idx
                continue
            print("start clipping from", start_frame_idx, "to", end_frame_idx)
            steps = []
            for frame_idx in range(start_frame_idx, end_frame_idx - DS_FACTOR, DS_FACTOR):
                image = video_frames[frame_idx][..., [2, 1, 0]]  # BGR -> RGB
                try:
                    s, a = compute_state_and_actions(
                        image,
                        label_detections[str(frame_idx)], label_detections[str(frame_idx + DS_FACTOR)],
                        frame_idx, save=False
                    )
                except Exception:
                    # missing wrist annotation for this camera/frame; stop the clip here
                    break
                # pack into a step dict
                step = {
                    "observation": {"image": image, "state": s},
                    "action": a,
                    "language_instruction": lang,
                }
                steps.append(OrderedDict(step))
                if len(steps) > MAX_EPISODE_LENGTH:
                    break
            start_frame_idx = end_frame_idx
            # only yield clips that collected enough steps
            if len(steps) >= MIN_CLIP_LENGTH:
                data_dict = {"steps": steps}
                print(f"max_frame_idx: {max_frame_idx} ds factor: {DS_FACTOR} {len(steps)}")
                yield data_dict
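

# Minimal usage sketch (illustrative, not part of the original pipeline; assumes the
# dataset paths above exist): iterate the first few takes and report episode lengths.
if __name__ == "__main__":
    for i, episode in enumerate(egoexo4d_dataset_generator(example_inds=range(3))):
        print(f"episode {i}: {len(episode['steps'])} steps")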