# Modified from https://github.com/showlab/UniVTG/blob/main/eval/eval.py
import argparse
import copy
from collections import OrderedDict, defaultdict
from functools import partial

import nncore
import numpy as np
from sklearn.metrics import precision_recall_curve
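

# Paired temporal IoU between a[i] and b[i] for two (N, 2) arrays of [start, end].
# Note: the denominator is the span hull (max end - min start) rather than the exact
# union, but both give the same IoU whenever the windows overlap, and the IoU is 0
# either way when they do not.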
def compute_temporal_iou_batch_paired(a, b):
    intersection = np.maximum(0, np.minimum(a[:, 1], b[:, 1]) - np.maximum(a[:, 0], b[:, 0]))
    union = np.maximum(a[:, 1], b[:, 1]) - np.minimum(a[:, 0], b[:, 0])
    return np.divide(intersection, union, out=np.zeros_like(intersection), where=union != 0)
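

# Cross temporal IoU between every pair in spans1 (N, 2) and spans2 (M, 2),
# returning (N, M) IoU and union matrices.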
def compute_temporal_iou_batch_cross(spans1, spans2):
    areas1 = spans1[:, 1] - spans1[:, 0]
    areas2 = spans2[:, 1] - spans2[:, 0]
    l = np.maximum(spans1[:, None, 0], spans2[None, :, 0])
    r = np.minimum(spans1[:, None, 1], spans2[None, :, 1])
    inter = np.clip(r - l, 0, None)
    union = areas1[:, None] + areas2[None, :] - inter
    iou = inter / union
    return iou, union
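

# Interpolated average precision: make precision monotonically non-increasing,
# then sum precision over the recall steps (PASCAL VOC style).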
def interpolated_precision_recall(prc, rec):
    mprc = np.hstack([[0], prc, [0]])
    mrec = np.hstack([[0], rec, [1]])
    for i in range(len(mprc) - 1)[::-1]:
        mprc[i] = max(mprc[i], mprc[i + 1])
    idx = np.where(mrec[1:] != mrec[:-1])[0] + 1
    ap = np.sum((mrec[idx] - mrec[idx - 1]) * mprc[idx])
    return ap
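

# Detection AP over a set of tIoU thresholds: greedily match score-sorted predictions
# to at most one GT window each (tracked by the lock_gt table), then integrate the
# precision-recall curve per threshold.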
def compute_average_precision_detection(annos, prediction, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
    num_thresholds = len(tiou_thresholds)
    num_gts = len(annos)
    num_preds = len(prediction)
    ap = np.zeros(num_thresholds)
    if len(prediction) == 0:
        return ap

    num_positive = float(num_gts)
    lock_gt = np.ones((num_thresholds, num_gts)) * -1

    # sort predictions by decreasing score
    prediction.sort(key=lambda x: -x['score'])
    tp = np.zeros((num_thresholds, num_preds))
    fp = np.zeros((num_thresholds, num_preds))

    ground_truth_by_videoid = dict()
    for i, item in enumerate(annos):
        item['index'] = i
        ground_truth_by_videoid.setdefault(item['video-id'], []).append(item)

    for idx, pred in enumerate(prediction):
        if pred['video-id'] in ground_truth_by_videoid:
            gts = ground_truth_by_videoid[pred['video-id']]
        else:
            fp[:, idx] = 1
            continue

        _pred = np.array([[pred['t-start'], pred['t-end']]])
        _gt = np.array([[gt['t-start'], gt['t-end']] for gt in gts])
        tiou_arr = compute_temporal_iou_batch_cross(_pred, _gt)[0]
        tiou_arr = tiou_arr.reshape(-1)

        # try to assign the prediction to the unmatched GT window with the highest tIoU
        tiou_sorted_idx = tiou_arr.argsort()[::-1]
        for t_idx, tiou_threshold in enumerate(tiou_thresholds):
            for j_idx in tiou_sorted_idx:
                if tiou_arr[j_idx] < tiou_threshold:
                    fp[t_idx, idx] = 1
                    break
                if lock_gt[t_idx, gts[j_idx]['index']] >= 0:
                    continue
                tp[t_idx, idx] = 1
                lock_gt[t_idx, gts[j_idx]['index']] = idx
                break
            if fp[t_idx, idx] == 0 and tp[t_idx, idx] == 0:
                fp[t_idx, idx] = 1

    tp_cumsum = np.cumsum(tp, axis=1).astype(float)
    fp_cumsum = np.cumsum(fp, axis=1).astype(float)
    recall_cumsum = tp_cumsum / num_positive
    precision_cumsum = tp_cumsum / (tp_cumsum + fp_cumsum)
    for t_idx in range(len(tiou_thresholds)):
        ap[t_idx] = interpolated_precision_recall(precision_cumsum[t_idx, :], recall_cumsum[t_idx, :])
    return ap
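

# Binary average precision from scores, with optional precision interpolation
# and 11-point approximation.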
def get_ap(y_true, y_pred, interpolate=True, point_11=False):
    assert len(y_true) == len(y_pred), 'Prediction and ground truth need to be of the same length'
    if len(set(y_true)) == 1:
        if y_true[0] == 0:
            return 0
        else:
            return 1
    else:
        assert sorted(set(y_true)) == [0, 1], 'Ground truth can only contain elements {0,1}'

    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    recall = recall.astype(np.float32)

    if interpolate:
        for i in range(1, len(precision)):
            precision[i] = max(precision[i - 1], precision[i])

    if point_11:
        precision_11 = [precision[np.where(recall >= t)[0][-1]] for t in np.arange(0, 1.01, 0.1)]
        return np.mean(precision_11)
    else:
        indices = np.where(np.diff(recall))
        return np.mean(precision[indices])
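

# Unpack a (qid, annos, prediction) triple so the AP computation can be mapped
# over queries.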
def compute_average_precision_detection_wrapper(input_triple, tiou_thresholds=np.linspace(0.5, 0.95, 10)):
    qid, annos, prediction = input_triple
    scores = compute_average_precision_detection(annos, prediction, tiou_thresholds=tiou_thresholds)
    return qid, scores
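

# Moment retrieval mAP: treat each query as its own small detection problem over
# its predicted and GT windows, then average AP over queries and tIoU thresholds.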
def compute_mr_ap(preds, annos, iou_thds=np.linspace(0.5, 0.95, 10), max_gt_windows=None, max_pred_windows=10):
    iou_thds = [float(f'{e:.2f}') for e in iou_thds]

    pred_qid2data = defaultdict(list)
    for d in preds:
        pred_windows = d['pred_relevant_windows'][:max_pred_windows] \
            if max_pred_windows is not None else d['pred_relevant_windows']
        qid = d['qid']
        for w in pred_windows:
            pred_qid2data[qid].append({'video-id': d['qid'], 't-start': w[0], 't-end': w[1], 'score': w[2]})

    gt_qid2data = defaultdict(list)
    for d in annos:
        gt_windows = d['relevant_windows'][:max_gt_windows] \
            if max_gt_windows is not None else d['relevant_windows']
        qid = d['qid']
        for w in gt_windows:
            gt_qid2data[qid].append({'video-id': d['qid'], 't-start': w[0], 't-end': w[1]})

    qid2ap_list = dict()
    data_triples = [[qid, gt_qid2data[qid], pred_qid2data[qid]] for qid in pred_qid2data]
    compute_ap_from_triple = partial(compute_average_precision_detection_wrapper, tiou_thresholds=iou_thds)
    for data_triple in data_triples:
        qid, scores = compute_ap_from_triple(data_triple)
        qid2ap_list[qid] = scores

    ap_array = np.array(list(qid2ap_list.values()))
    ap_thds = ap_array.mean(0)
    iou_thd2ap = dict(zip([str(e) for e in iou_thds], ap_thds))
    iou_thd2ap['average'] = np.mean(ap_thds)
    iou_thd2ap = {k: float(f'{100 * v:.2f}') for k, v in iou_thd2ap.items()}
    return iou_thd2ap
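

# Moment retrieval R1: compare the top-1 predicted window of each query against its
# best-matching GT window, reporting recall at several IoU thresholds plus mean IoU.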
def compute_mr_r1(preds, annos, iou_thds=np.linspace(0.3, 0.95, 14)):
    iou_thds = [float(f'{e:.2f}') for e in iou_thds]
    pred_qid2window = {d['qid']: d['pred_relevant_windows'][0][:2] for d in preds}

    gt_qid2window = dict()
    for d in annos:
        cur_gt_windows = d['relevant_windows']
        cur_qid = d['qid']
        cur_max_iou_idx = 0
        if len(cur_gt_windows) > 0:
            # select the GT window that has the highest IoU with the top-1 prediction
            cur_ious = compute_temporal_iou_batch_cross(
                np.array([pred_qid2window[cur_qid]]), np.array(d['relevant_windows']))[0]
            cur_max_iou_idx = np.argmax(cur_ious)
        gt_qid2window[cur_qid] = cur_gt_windows[cur_max_iou_idx]

    qids = list(pred_qid2window.keys())
    pred_windows = np.array([pred_qid2window[k] for k in qids]).astype(float)
    gt_windows = np.array([gt_qid2window[k] for k in qids]).astype(float)
    pred_gt_iou = compute_temporal_iou_batch_paired(pred_windows, gt_windows)

    iou_thd2recall_at_one = dict()
    miou_at_one = float(f'{np.mean(pred_gt_iou) * 100:.2f}')
    for thd in iou_thds:
        iou_thd2recall_at_one[str(thd)] = float(f'{np.mean(pred_gt_iou >= thd) * 100:.2f}')
    return iou_thd2recall_at_one, miou_at_one
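

# Moment retrieval R5: a query counts as recalled if any of its top-5 predicted
# windows reaches the IoU threshold against its best-matching GT window.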
def compute_mr_r5(preds, annos, iou_thds=np.linspace(0.3, 0.95, 14)):
    iou_thds = [float(f'{e:.2f}') for e in iou_thds]
    pred_qid2window = {d['qid']: [x[:2] for x in d['pred_relevant_windows'][:5]] for d in preds}

    gt_qid2window = dict()
    pred_optimal_qid2window = dict()
    for d in annos:
        cur_gt_windows = d['relevant_windows']
        cur_qid = d['qid']
        cur_max_iou_pred = 0
        cur_max_iou_gt = 0
        if len(cur_gt_windows) > 0:
            # pick the (prediction, GT) pair with the highest IoU among the top-5 predictions
            cur_ious = compute_temporal_iou_batch_cross(
                np.array(pred_qid2window[cur_qid]), np.array(d['relevant_windows']))[0]
            cur_ious[np.isnan(cur_ious)] = 0
            cur_max_iou_pred, cur_max_iou_gt = np.where(cur_ious == np.max(cur_ious))
            cur_max_iou_pred, cur_max_iou_gt = cur_max_iou_pred[0], cur_max_iou_gt[0]
        pred_optimal_qid2window[cur_qid] = pred_qid2window[cur_qid][cur_max_iou_pred]
        gt_qid2window[cur_qid] = cur_gt_windows[cur_max_iou_gt]

    qids = list(pred_qid2window.keys())
    pred_windows = np.array([pred_optimal_qid2window[k] for k in qids]).astype(float)
    gt_windows = np.array([gt_qid2window[k] for k in qids]).astype(float)
    pred_gt_iou = compute_temporal_iou_batch_paired(pred_windows, gt_windows)

    iou_thd2recall_at_five = dict()
    for thd in iou_thds:
        iou_thd2recall_at_five[str(thd)] = float(f'{np.mean(pred_gt_iou >= thd) * 100:.2f}')
    return iou_thd2recall_at_five
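

# Filter GT windows (and the corresponding predictions) by moment length to build
# the short/middle/long splits.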
def get_data_by_range(preds, annos, len_range):
    min_l, max_l = len_range
    if min_l == 0 and max_l == float('inf'):
        return preds, annos

    # only keep GT windows whose length falls in the range
    ground_truth_in_range = []
    gt_qids_in_range = set()
    for d in annos:
        rel_windows_in_range = [w for w in d['relevant_windows'] if min_l < (w[1] - w[0]) <= max_l]
        if len(rel_windows_in_range) > 0:
            d = copy.deepcopy(d)
            d['relevant_windows'] = rel_windows_in_range
            ground_truth_in_range.append(d)
            gt_qids_in_range.add(d['qid'])

    # only keep predictions whose qid has at least one GT window in the range
    submission_in_range = []
    for d in preds:
        if d['qid'] in gt_qids_in_range:
            submission_in_range.append(copy.deepcopy(d))

    # fall back to the full set when the range filters out everything
    if submission_in_range == ground_truth_in_range == []:
        return preds, annos

    return submission_in_range, ground_truth_in_range
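

# Run mAP / R1 / R5 over the short ([0, 10]s), middle ([10, 30]s), long (>30s),
# and full length splits.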
def eval_moment_retrieval(preds, annos):
    length_ranges = [[0, 10], [10, 30], [30, float('inf')], [0, float('inf')]]
    range_names = ['short', 'middle', 'long', 'full']

    ret_metrics = dict()
    for l_range, name in zip(length_ranges, range_names):
        _submission, _ground_truth = get_data_by_range(preds, annos, l_range)
        print(f'{name}: {l_range}, {len(_ground_truth)}/{len(annos)}={100*len(_ground_truth)/len(annos):.2f} samples')
        iou_thd2average_precision = compute_mr_ap(_submission, _ground_truth)
        iou_thd2recall_at_one, miou_at_one = compute_mr_r1(_submission, _ground_truth)
        iou_thd2recall_at_five = compute_mr_r5(_submission, _ground_truth)
        ret_metrics[name] = {
            'MR-mIoU': miou_at_one,
            'MR-mAP': iou_thd2average_precision,
            'MR-R1': iou_thd2recall_at_one,
            'MR-R5': iou_thd2recall_at_five
        }
    return ret_metrics
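

# Highlight detection Hit@1: check whether the clip with the highest predicted
# saliency score is a positive clip under at least one of the 3 annotation scores.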
def compute_hl_hit1(qid2preds, qid2gt_scores_binary):
    qid2max_scored_clip_idx = {k: np.argmax(v['pred_saliency_scores']) for k, v in qid2preds.items()}
    hit_scores = np.zeros((len(qid2preds), 3))
    qids = list(qid2preds.keys())
    for idx, qid in enumerate(qids):
        pred_clip_idx = qid2max_scored_clip_idx[qid]
        gt_scores_binary = qid2gt_scores_binary[qid]
        if pred_clip_idx < len(gt_scores_binary):
            hit_scores[idx] = gt_scores_binary[pred_clip_idx]
    # a hit if the top-scored clip is positive under any of the 3 annotation scores
    hit_at_one = float(f'{100 * np.mean(np.max(hit_scores, 1)):.2f}')
    return hit_at_one
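

# Highlight detection mAP: one AP per (query, annotation score) pair over the
# binarized per-clip saliency labels, averaged over all pairs.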
def compute_hl_ap(qid2preds, qid2gt_scores_binary):
    qid2pred_scores = {k: v['pred_saliency_scores'] for k, v in qid2preds.items()}
    ap_scores = np.zeros((len(qid2preds), 3))
    qids = list(qid2preds.keys())

    input_tuples = []
    for idx, qid in enumerate(qids):
        for w_idx in range(3):
            y_true = qid2gt_scores_binary[qid][:, w_idx]
            y_pred = np.array(qid2pred_scores[qid])
            input_tuples.append((idx, w_idx, y_true, y_pred))

    for input_tuple in input_tuples:
        idx, w_idx, score = compute_ap_from_tuple(input_tuple)
        ap_scores[idx, w_idx] = score

    mean_ap = float(f'{100 * np.mean(ap_scores):.2f}')
    return mean_ap
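

# AP for a single (query, annotation score) pair, aligning the prediction length
# with the GT clip sequence first.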
def compute_ap_from_tuple(input_tuple):
    idx, w_idx, y_true, y_pred = input_tuple
    if len(y_true) < len(y_pred):
        # truncate predictions that are longer than the GT clip sequence
        y_pred = y_pred[:len(y_true)]
    elif len(y_true) > len(y_pred):
        # zero-pad predictions that are shorter than the GT clip sequence
        _y_predict = np.zeros(len(y_true))
        _y_predict[:len(y_pred)] = y_pred
        y_pred = _y_predict
    score = get_ap(y_true, y_pred)
    return idx, w_idx, score
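

# Expand the sparse per-clip saliency annotations (3 scores per clip) to a dense
# (num_clips, 3) array over fixed-length clips (2 seconds by default).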
def mk_gt_scores(gt_data, clip_length=2):
    num_clips = int(gt_data['duration'] / clip_length)
    saliency_scores_full_video = np.zeros((num_clips, 3))
    relevant_clip_ids = np.array(gt_data['relevant_clip_ids'])
    saliency_scores_relevant_clips = np.array(gt_data['saliency_scores'])
    saliency_scores_full_video[relevant_clip_ids] = saliency_scores_relevant_clips
    return saliency_scores_full_video
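

# Evaluate highlight detection at three binarization levels of the raw saliency
# scores: Fair (>= 2), Good (>= 3), VeryGood (>= 4).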
def eval_highlight(preds, annos):
    qid2preds = {d['qid']: d for d in preds}
    qid2gt_scores_full_range = {d['qid']: mk_gt_scores(d) for d in annos}
    gt_saliency_score_min_list = [2, 3, 4]
    saliency_score_names = ['Fair', 'Good', 'VeryGood']

    highlight_det_metrics = dict()
    for gt_saliency_score_min, score_name in zip(gt_saliency_score_min_list, saliency_score_names):
        qid2gt_scores_binary = {
            k: (v >= gt_saliency_score_min).astype(float)
            for k, v in qid2gt_scores_full_range.items()
        }
        hit_at_one = compute_hl_hit1(qid2preds, qid2gt_scores_binary)
        mean_ap = compute_hl_ap(qid2preds, qid2gt_scores_binary)
        highlight_det_metrics[f'HL-min-{score_name}'] = {'HL-mAP': mean_ap, 'HL-Hit1': hit_at_one}
    return highlight_det_metrics
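

# Main entry: compute moment retrieval and/or highlight detection metrics depending
# on which keys the predictions provide, and collect headline numbers under 'brief'.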
def qvhighlights_eval(preds, annos):
    pred_qids = set([e['qid'] for e in preds])
    gt_qids = set([e['qid'] for e in annos])
    assert pred_qids == gt_qids, 'qids in annos and preds must match'

    eval_metrics = dict()
    eval_metrics_brief = OrderedDict()
    if 'pred_relevant_windows' in preds[0]:
        moment_ret_scores = eval_moment_retrieval(preds, annos)
        eval_metrics.update(moment_ret_scores)
        moment_ret_scores_brief = {
            'MR-full-mAP': moment_ret_scores['full']['MR-mAP']['average'],
            'MR-full-mAP@0.5': moment_ret_scores['full']['MR-mAP']['0.5'],
            'MR-full-mAP@0.75': moment_ret_scores['full']['MR-mAP']['0.75'],
            'MR-short-mAP': moment_ret_scores['short']['MR-mAP']['average'],
            'MR-middle-mAP': moment_ret_scores['middle']['MR-mAP']['average'],
            'MR-long-mAP': moment_ret_scores['long']['MR-mAP']['average'],
            'MR-short-mIoU': moment_ret_scores['short']['MR-mIoU'],
            'MR-middle-mIoU': moment_ret_scores['middle']['MR-mIoU'],
            'MR-long-mIoU': moment_ret_scores['long']['MR-mIoU'],
            'MR-full-mIoU': moment_ret_scores['full']['MR-mIoU'],
            'MR-full-R1@0.3': moment_ret_scores['full']['MR-R1']['0.3'],
            'MR-full-R1@0.5': moment_ret_scores['full']['MR-R1']['0.5'],
            'MR-full-R1@0.7': moment_ret_scores['full']['MR-R1']['0.7'],
            'MR-full-R5@0.3': moment_ret_scores['full']['MR-R5']['0.3'],
            'MR-full-R5@0.5': moment_ret_scores['full']['MR-R5']['0.5'],
            'MR-full-R5@0.7': moment_ret_scores['full']['MR-R5']['0.7']
        }
        eval_metrics_brief.update(sorted([(k, v) for k, v in moment_ret_scores_brief.items()], key=lambda x: x[0]))

    if ('pred_saliency_scores' in preds[0]) and ('saliency_scores' in annos[0]):
        if isinstance(annos[0]['saliency_scores'], list):
            highlight_det_scores = eval_highlight(preds, annos)
            eval_metrics.update(highlight_det_scores)
            highlight_det_scores_brief = dict([(f"{k}-{sub_k.split('-')[1]}", v[sub_k])
                                               for k, v in highlight_det_scores.items() for sub_k in v])
            eval_metrics_brief.update(highlight_det_scores_brief)
            # move the VeryGood metrics to the end of the brief dict
            eval_metrics_brief['HL-min-VeryGood-mAP'] = eval_metrics_brief.pop('HL-min-VeryGood-mAP')
            eval_metrics_brief['HL-min-VeryGood-Hit1'] = eval_metrics_brief.pop('HL-min-VeryGood-Hit1')

    # brief metrics first, then the full per-split metrics sorted by key
    final_eval_metrics = OrderedDict()
    final_eval_metrics['brief'] = eval_metrics_brief
    final_eval_metrics.update(sorted([(k, v) for k, v in eval_metrics.items()], key=lambda x: x[0]))
    return final_eval_metrics
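

# Command-line interface: a prediction file (or a directory of files) and the
# annotation path.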
def parse_args():
    parser = argparse.ArgumentParser()
    parser.add_argument('pred_path')
    parser.add_argument('--anno_path', default='data/qvhighlights/highlight_val_release.jsonl')
    parser.add_argument('--out_name', default='metrics.log')
    args = parser.parse_args()
    return args
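

# Usage: python eval_qvhighlights.py PRED_PATH [--anno_path PATH] [--out_name NAME]
# PRED_PATH may be a single json/jsonl file or a directory of such files.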
if __name__ == '__main__':
    args = parse_args()

    if nncore.is_dir(args.pred_path):
        log_file = nncore.join(args.pred_path, args.out_name)
    else:
        log_file = nncore.same_dir(args.pred_path, args.out_name)

    nncore.set_default_logger(logger='eval', fmt=None, log_file=log_file)

    if nncore.is_dir(args.pred_path):
        pred_paths = nncore.ls(args.pred_path, ext=['json', 'jsonl'], join_path=True, sort=True)
        nncore.log(f'Total number of files: {len(pred_paths)}\n')
        preds = nncore.flatten([nncore.load(p) for p in pred_paths])
    else:
        nncore.log(f'Loading predictions from {args.pred_path}')
        preds = nncore.load(args.pred_path)

    annos = nncore.load(args.anno_path)

    res = qvhighlights_eval(preds, annos)['brief']
    for k, v in res.items():
        nncore.log(f'{k}: {v}')