import os
import sys

import requests
from tqdm import tqdm

# Number of bytes requested from the response stream per iteration.
_CHUNK_SIZE = 1024


def _remote_file_size(url):
    """Return the size in bytes advertised for *url*, or None if unknown."""
    # requests.head() does not follow redirects by default; without
    # allow_redirects=True the Content-Length of the redirect response itself
    # would be reported instead of that of the final resource.
    response = requests.head(url, allow_redirects=True)
    try:
        # The header value is a string of decimal digits.
        return int(response.headers["Content-Length"])
    except (KeyError, ValueError):
        print('unknown file length')
        return None


def _stream_to_file(url, dl_path, label, filesize):
    """Stream *url* to *dl_path*, showing a tqdm progress bar named *label*."""
    # Write to a temporary sibling first so that an interrupted transfer never
    # leaves a truncated file at dl_path, which a later run would mistake for
    # a finished download.
    part_path = dl_path + ".part"
    try:
        with requests.get(url, stream=True) as response:
            # Fail loudly on 4xx/5xx instead of saving the error page.
            response.raise_for_status()
            with open(part_path, "wb") as out_file, tqdm(
                unit="B",           # unit string shown on the bar
                unit_scale=True,    # let tqdm pick kilo/mega/... prefixes
                unit_divisor=1024,  # only used when unit_scale is true
                total=filesize,     # None means the total is unknown
                file=sys.stdout,    # tqdm writes to stderr by default
                desc=label,         # prefix shown on the bar
            ) as progress:
                for chunk in response.iter_content(chunk_size=_CHUNK_SIZE):
                    progress.update(out_file.write(chunk))
    except BaseException:
        # Also covers KeyboardInterrupt: drop the partial file, then re-raise.
        try:
            os.remove(part_path)
        except OSError:
            pass
        raise
    os.replace(part_path, dl_path)


def download_dataset(urls, dest_dir, filename=None):
    """Download each URL in *urls* into *dest_dir*.

    Adapted from
    https://github.com/sirbowen78/lab/blob/master/file_handling/dl_file1.py

    Args:
        urls: Iterable of URL strings, processed in order.
        dest_dir: Directory that receives the files; created if missing.
        filename: Passed to ``gdown.download`` as ``output`` for Google Drive
            URLs. Other URLs are always stored under the last path component
            of the URL.

    Returns:
        * The result of ``gdown.download`` as soon as a URL containing
          ``'drive.google'`` is reached (remaining URLs are not processed).
        * The path of the target file as soon as one is found to exist
          already (remaining URLs are not processed).
        * ``True`` once every URL has been downloaded.

    Raises:
        requests.HTTPError: If the server answers a download with an error
            status.
        requests.RequestException: On connection problems.

    NOTE: the early returns and the mixed return types are kept as they were
    so existing callers keep working.
    """
    os.makedirs(dest_dir, exist_ok=True)

    for url in urls:
        if 'drive.google' in url:
            # Imported lazily so gdown is only needed for Drive links.
            import gdown
            # NOTE: output is not joined with dest_dir, so the file lands
            # wherever gdown resolves `filename` (unchanged behaviour).
            return gdown.download(url, quiet=False, output=filename)

        # Use a separate local name: overwriting `filename` here would leak
        # this URL's basename into the gdown call of a later Drive URL.
        basename = os.path.basename(url)
        dl_path = os.path.join(dest_dir, basename)

        # Check the local copy before touching the network.
        if os.path.exists(dl_path):
            print(f'file {dl_path} already exist')
            return dl_path

        _stream_to_file(url, dl_path, basename, _remote_file_size(url))

    return True