import argparse |
import multiprocessing as mp |
import os |
from functools import partial |
from time import time as timer |
from pytube import YouTube |
from tqdm import tqdm |
# Command-line interface. The options are declared in a table and registered in
# order; parsing runs at module level so ``args`` is a module-level global that
# the rest of the script reads.
parser = argparse.ArgumentParser()
_OPTION_SPECS = (
    ('--input_list', {'type': str, 'required': True,
                      'help': 'List of youtube video ids'}),
    ('--output_dir', {'type': str, 'default': 'data/youtube_videos',
                      'help': 'Location to download videos'}),
    ('--num_workers', {'type': int, 'default': 8,
                       'help': 'How many multiprocessing workers?'}),
)
for _flag, _spec in _OPTION_SPECS:
    parser.add_argument(_flag, **_spec)
args = parser.parse_args()
def download_video(output_dir, video_id):
    r"""Download one YouTube video to ``<output_dir>/<video_id>.mp4``.

    Nothing is downloaded when the target file already exists. A video-only
    adaptive mp4 stream is preferred; when there is none, the first mp4 stream
    of any kind is used instead.

    The data is first written to ``<video_id>.mp4.part`` and only renamed to
    the final name once the download has completed, so an interrupted transfer
    cannot leave a truncated ``.mp4`` behind that a later run would mistake for
    a finished download.

    Errors are printed to stdout and swallowed; the function never raises for a
    failed download and always returns ``None``.

    Args:
        output_dir (str): Directory that receives the video file.
        video_id (str): YouTube video id (the ``v=`` parameter of the URL).
    """
    filename = video_id + '.mp4'
    video_path = os.path.join(output_dir, filename)
    if os.path.isfile(video_path):
        print('File exists: %s' % (video_id))
        return

    partial_name = filename + '.part'
    partial_path = os.path.join(output_dir, partial_name)
    try:
        yt = YouTube('https://www.youtube.com/watch?v=%s' % (video_id))
        stream = yt.streams.filter(subtype='mp4', only_video=True, adaptive=True).first()
        if stream is None:
            # No video-only adaptive stream: fall back to any mp4 stream.
            stream = yt.streams.filter(subtype='mp4').first()
        if stream is None:
            # Report this clearly instead of failing with an AttributeError
            # on ``None.download``.
            raise RuntimeError('No mp4 stream available')
        written_path = stream.download(output_path=output_dir, filename=partial_name)
        # Publish the file under its final name only after a complete download.
        os.replace(written_path, video_path)
    except Exception as e:
        # Best-effort worker boundary: report the problem and move on.
        print(e)
        print('Failed to download %s' % (video_id))
        # Do not leave a half-written temporary file behind.
        try:
            if os.path.isfile(partial_path):
                os.remove(partial_path)
        except OSError:
            pass
def read_video_ids(list_path):
    r"""Read video ids from a text file that holds one id per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped, and repeated ids are dropped (the first occurrence is kept and the
    original order is preserved) so that the same video is never handed to two
    workers at once.

    Args:
        list_path (str): Path of the text file to read.

    Returns:
        list of str: The unique, non-empty ids in file order.
    """
    with open(list_path) as fin:
        stripped = (line.strip() for line in fin)
        # dict.fromkeys de-duplicates while keeping insertion order.
        return list(dict.fromkeys(vid for vid in stripped if vid))


if __name__ == '__main__':
    video_ids = read_video_ids(args.input_list)

    os.makedirs(args.output_dir, exist_ok=True)

    # Bind the output directory so the pool only has to pass the video id.
    downloader = partial(download_video, args.output_dir)

    start = timer()
    pool_size = args.num_workers
    print('Using pool size of %d' % (pool_size))
    with mp.Pool(processes=pool_size) as p:
        # Drain the result iterator inside the ``with`` block; tqdm advances
        # the progress bar once per finished video, in completion order.
        for _ in tqdm(p.imap_unordered(downloader, video_ids), total=len(video_ids)):
            pass
    print('Elapsed time: %.2f' % (timer() - start))