import os
import shlex

from tqdm import tqdm

from utils.mpr import MultipleProcessRunner


def _download_file(url: str, save_path: str, skip_error_info: bool = False) -> None:
    """
    Download ``url`` to ``save_path`` with ``wget``, cleaning up on failure.

    If ``wget`` exits with a non-zero status, the (possibly partial or empty)
    output file is removed and an error line is echoed.

    Args:
        url: Address of the file to download
        save_path: Local path the file is written to
        skip_error_info: If True, the error line is redirected to /dev/null
    """
    # Quote everything that is interpolated into the shell command so that
    # spaces or shell metacharacters in ids / paths cannot break or hijack it.
    quoted_url = shlex.quote(url)
    quoted_path = shlex.quote(save_path)

    wget = f"wget -q -o /dev/null {quoted_url} -O {quoted_path}"
    # "-f" keeps the failure branch going (and the error visible) even when
    # wget never created the file; "--" guards paths that start with a dash.
    rm = f"rm -f -- {quoted_path}"
    err = "echo " + shlex.quote(f"Error: {url} cannot be downloaded!")
    if skip_error_info:
        err += " >/dev/null"

    os.system(f"{wget} || ({rm} && {err})")


class Downloader(MultipleProcessRunner):
    """
    Download files whose URLs follow a common template
    """

    def __init__(self, base_url, save_path, overwrite=False, skip_error_info=False, **kwargs):
        """
        Args:
            base_url: URL template; its "{}" placeholder is filled with each item id
            save_path: Saving path template; its "{}" placeholder is filled with each item id
            overwrite: Whether to overwrite existing files. If False, items whose
                target file already exists are dropped from the work list
            skip_error_info: Whether to suppress the error line printed for failed downloads
            **kwargs: Forwarded unchanged to MultipleProcessRunner.__init__
        """
        super().__init__(**kwargs)

        self.base_url = base_url
        self.save_path = save_path
        self.overwrite = overwrite
        self.skip_error_info = skip_error_info

        if not overwrite:
            # Remove items that were already downloaded.
            # NOTE(review): assumes self.data was populated by the parent
            # constructor (subclasses pass it as the `data` keyword) - confirm.
            self.data = [
                uniprot
                for uniprot in tqdm(self.data, desc="Filtering out existing files...")
                if not os.path.exists(self.save_path.format(uniprot))
            ]

    def _aggregate(self, final_path: str, sub_paths):
        # Nothing to merge: every file is written directly to its own save path.
        pass

    def _target_static(self, process_id, data, sub_path, *args):
        """
        Download every item in ``data`` and report progress after each one.

        Args:
            process_id: Id of the worker, used in the progress message
            data: Item ids handled by this worker
            sub_path: Unused here
        """
        for i, uniprot in enumerate(data):
            url = self.base_url.format(uniprot)
            save_path = self.save_path.format(uniprot)
            _download_file(url, save_path, self.skip_error_info)

            self.terminal_progress_bar(
                process_id, i + 1, len(data), f"Process{process_id} Downloading files..."
            )

    def run(self):
        """
        Run this function to download files
        """
        super().run()

    def __len__(self):
        return len(self.data)

    @staticmethod
    def clear_empty_files(path):
        """
        Remove empty (zero-byte) files in a specific directory.

        Args:
            path: Directory to clean. Only its direct entries are inspected

        Returns:
            Number of removed files
        """
        cnt = 0
        for file in tqdm(os.listdir(path), desc="Clearing empty files..."):
            file_path = os.path.join(path, file)
            # Only regular files are candidates: os.remove cannot delete a
            # directory, and some filesystems report size 0 for directories.
            if os.path.isfile(file_path) and os.path.getsize(file_path) == 0:
                os.remove(file_path)
                cnt += 1

        print(f"Removed {cnt} empty files")
        return cnt


class AlphaDBDownloader(Downloader):
    """
    Download files from AlphaFold2 database
    """

    # NOTE: the parameter name `type` shadows the builtin; it is kept because
    # callers may pass it by keyword.
    def __init__(self, uniprot_ids, type: str, save_dir: str, **kwargs):
        """
        Args:
            uniprot_ids: Uniprot ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif', 'plddt', "pae"]
            save_dir: Saving directory
            **kwargs: Forwarded to Downloader.__init__

        Raises:
            KeyError: If ``type`` is not one of the supported values
        """
        url_dict = {
            "pdb": "https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb",
            "mmcif": "https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.cif",
            "plddt": "https://alphafold.ebi.ac.uk/files/AF-{}-F1-confidence_v4.json",
            "pae": "https://alphafold.ebi.ac.uk/files/AF-{}-F1-predicted_aligned_error_v4.json"
        }

        # "plddt" and "pae" share the same file name pattern, so they need
        # different save directories to avoid overwriting each other.
        save_dict = {
            "pdb": "{}.pdb",
            "mmcif": "{}.cif",
            "plddt": "{}.json",
            "pae": "{}.json"
        }
        base_url = url_dict[type]
        save_path = os.path.join(save_dir, save_dict[type])

        super().__init__(data=uniprot_ids, base_url=base_url, save_path=save_path, **kwargs)


class PDBDownloader(Downloader):
    """
    Download files from PDB
    """

    def __init__(self, pdb_ids, type: str, save_dir: str, **kwargs):
        """
        Args:
            pdb_ids: PDB ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif']
            save_dir: Saving directory
            **kwargs: Forwarded to Downloader.__init__

        Raises:
            KeyError: If ``type`` is not one of the supported values
        """
        url_dict = {
            "pdb": "https://files.rcsb.org/download/{}.pdb",
            "mmcif": "https://files.rcsb.org/download/{}.cif"
        }

        save_dict = {
            "pdb": "{}.pdb",
            "mmcif": "{}.cif"
        }

        base_url = url_dict[type]
        save_path = os.path.join(save_dir, save_dict[type])

        super().__init__(data=pdb_ids, base_url=base_url, save_path=save_path, **kwargs)


class CATHDownloader(Downloader):
    """
    Download files from CATH
    """

    def __init__(self, cath_ids, save_dir, **kwargs):
        """
        Args:
            cath_ids: CATH ids
            save_dir: Saving directory
            **kwargs: Forwarded to Downloader.__init__
        """
        url = "http://www.cathdb.info/version/v4_3_0/api/rest/id/{}.pdb"
        save_path = os.path.join(save_dir, "{}.pdb")

        super().__init__(data=cath_ids, base_url=url, save_path=save_path, **kwargs)


def download_pdb(pdb_id: str, format: str, save_path: str):
    """
    Download pdb file from PDB

    Args:
        pdb_id: PDB id
        format: File format, must be one of ['pdb', 'cif']
        save_path: Saving path
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.{format}"
    _download_file(url, save_path)


def download_af2(uniprot_id: str, format: str, save_path: str):
    """
    Download files from AlphaFold2 database

    Args:
        uniprot_id: Uniprot id
        format: File format, must be one of ['pdb', 'cif', 'plddt', 'pae']
        save_path: Saving path
    """
    # 'plddt' and 'pae' are JSON files with their own names on the server
    # (same names AlphaDBDownloader uses); structure formats are served as
    # "model_v4.<extension>".
    special_suffixes = {
        "plddt": "confidence_v4.json",
        "pae": "predicted_aligned_error_v4.json",
    }
    suffix = special_suffixes.get(format, f"model_v4.{format}")

    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-{suffix}"
    _download_file(url, save_path)