|
import argparse |
|
import multiprocessing as mp |
|
import os |
|
from functools import partial |
|
from time import time as timer |
|
|
|
from pytube import YouTube |
|
from tqdm import tqdm |
|
|
|
# Command-line interface. Parsing happens at module level, so `args` is
# available to everything below as soon as the module is loaded.
parser = argparse.ArgumentParser()
parser.add_argument(
    '--input_list',
    required=True,
    type=str,
    help='List of youtube video ids',
)
parser.add_argument(
    '--output_dir',
    default='data/youtube_videos',
    type=str,
    help='Location to download videos',
)
parser.add_argument(
    '--num_workers',
    default=8,
    type=int,
    help='How many multiprocessing workers?',
)
args = parser.parse_args()
|
|
|
|
|
def download_video(output_dir, video_id):
    r"""Download one YouTube video to ``<output_dir>/<video_id>.mp4``.

    If the target file already exists nothing is fetched and a notice is
    printed. Otherwise a video-only adaptive mp4 stream is preferred, with
    any mp4 stream as the fallback. Failures are printed rather than raised,
    so a single bad id never propagates an exception to the caller.

    Args:
        output_dir: Directory the video file is written into.
        video_id: YouTube video id (the ``v=`` part of the watch URL).

    Returns:
        None. Progress and errors are reported on stdout.
    """
    video_path = os.path.join(output_dir, video_id + '.mp4')

    # Guard clause: skip ids that have already been fetched.
    if os.path.isfile(video_path):
        print('File exists: %s' % (video_id))
        return

    try:
        yt = YouTube('https://www.youtube.com/watch?v=%s' % (video_id))

        # Prefer a video-only adaptive stream; fall back to any mp4 stream.
        stream = yt.streams.filter(subtype='mp4', only_video=True, adaptive=True).first()
        if stream is None:
            stream = yt.streams.filter(subtype='mp4').first()

        # Without this check the failure would surface as an opaque
        # "'NoneType' object has no attribute 'download'" message.
        if stream is None:
            print('No mp4 stream available for %s' % (video_id))
            print('Failed to download %s' % (video_id))
            return

        stream.download(output_path=output_dir, filename=video_id + '.mp4')
    except Exception as e:
        # Deliberately best-effort: report the error and move on.
        print(e)
        print('Failed to download %s' % (video_id))
|
|
|
|
|
def read_video_ids(list_path):
    """Read YouTube video ids from a text file, one id per line.

    Surrounding whitespace is stripped from every line. Blank lines are
    skipped, because an empty id would otherwise be turned into a bogus
    download request. Repeated ids are dropped (first occurrence wins) so
    that two workers never write the same output file at the same time.

    Args:
        list_path: Path of the text file to read.

    Returns:
        List of unique, non-empty ids in their original order.
    """
    video_ids = []
    seen = set()
    with open(list_path) as fin:
        for line in fin:
            video_id = line.strip()
            if not video_id or video_id in seen:
                continue
            seen.add(video_id)
            video_ids.append(video_id)
    return video_ids


if __name__ == '__main__':
    # Collect the ids to fetch.
    video_ids = read_video_ids(args.input_list)

    # Make sure the destination directory exists before workers start.
    os.makedirs(args.output_dir, exist_ok=True)

    # Bind the output directory so each worker only needs a video id.
    downloader = partial(download_video, args.output_dir)

    start = timer()
    pool_size = args.num_workers
    print('Using pool size of %d' % (pool_size))
    with mp.Pool(processes=pool_size) as p:
        # imap_unordered is lazy; list() drains it so every download runs
        # while tqdm reports progress as results come back.
        _ = list(tqdm(p.imap_unordered(downloader, video_ids), total=len(video_ids)))
    print('Elapsed time: %.2f' % (timer() - start))
|
|