File size: 1,826 Bytes
be13417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
"""
 Copyright (c) 2023, salesforce.com, inc.
 All rights reserved.
 SPDX-License-Identifier: BSD-3-Clause
 For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""


## Prerequisites: run 'pip install youtube-dl' to install the youtube-dl package.
## Specify the location of the output videos and of the input JSON file.
## This script can also be used for YouCook2 by updating the file paths.
import json
import os
import subprocess

# Root directory for the downloaded videos and the annotation file to read.
output_path = './videos'
json_path = './COIN.json'

# exist_ok avoids the check-then-create race and also creates missing parents.
os.makedirs(output_path, exist_ok=True)

# Load the annotation database; the context manager closes the file promptly.
with open(json_path, 'r') as json_file:
	data = json.load(json_file)['database']
youtube_ids = list(data.keys())

# Download every video to <output_path>/<recipe_type>/<youtube_id>.mp4.
for youtube_id, info in data.items():
	recipe_type = info['recipe_type']
	url = info['video_url']
	vid_loc = os.path.join(output_path, str(recipe_type))
	os.makedirs(vid_loc, exist_ok=True)
	# Pass an argument list instead of a shell string so that characters such
	# as '&', '?' or ';' in the URL are never interpreted by a shell.
	# check=False keeps going when a single download fails (e.g. the video was
	# removed), matching the previous best-effort behaviour.
	subprocess.run(
		['youtube-dl', '-o', os.path.join(vid_loc, youtube_id + '.mp4'), '-f', 'best', url],
		check=False,
	)

	# To save disk space, you could download the best format available
	# 	but not better than 480p, or optionally any other quality.
	# See https://askubuntu.com/questions/486297/how-to-select-video-quality-from-youtube-dl

## Convert annotations: flatten the database into one record per annotated
## segment and split the records into train.json / test.json by 'subset'.
with open(json_path) as json_file:
	all_json = json.load(json_file)['database']
train_data = []
test_data = []
for v in all_json.values():
	annotations = v['annotation']
	if not annotations:
		# Nothing to emit for this video; skip before touching its other fields.
		continue
	# These values are the same for every annotation of a video, so compute
	# them once per video rather than once per segment.
	# NOTE(review): the id is taken from the last URL path component, not from
	# the database key -- confirm the two always agree for the dataset in use.
	youtube_id = v["video_url"].split("/")[-1]
	recipe_type = v["recipe_type"]
	video_path = f'{recipe_type}/{youtube_id}.mp4'
	# Anything that is not explicitly 'training' goes to the test split.
	target = train_data if v['subset'] == 'training' else test_data
	for gt_ann in annotations:
		target.append({
			'youtube_id': youtube_id,
			'recipe_type': recipe_type,
			'video_path': video_path,
			'caption': gt_ann['label'],
			'id': gt_ann['id'],
			'ts': gt_ann['ts'],
		})

# Context managers guarantee the output is flushed and the files are closed.
with open('train.json', 'w') as train_file:
	json.dump(train_data, train_file)
with open('test.json', 'w') as test_file:
	json.dump(test_data, test_file)