|
import os
import subprocess

from tqdm import tqdm

from utils.mpr import MultipleProcessRunner
|
|
|
|
|
|
class Downloader(MultipleProcessRunner):
    """
    Download files that are addressed by a Uniform Resource Locator (URL).

    ``base_url`` and ``save_path`` are ``str.format`` templates: every item of
    ``self.data`` is substituted into both to obtain the remote URL and the
    local destination of one download.
    """

    def __init__(self, base_url, save_path, overwrite=False, skip_error_info=False, **kwargs):
        """
        Args:
            base_url: URL template; ``base_url.format(item)`` is the URL to fetch
            save_path: Path template; ``save_path.format(item)`` is the local file to write
            overwrite: Whether to download again files that already exist locally
            skip_error_info: Whether to suppress the message printed when a download fails
            **kwargs: Passed unchanged to the parent constructor
        """
        # NOTE(review): self.data is read below, so the parent constructor is
        # assumed to set it (presumably from a ``data`` keyword) - confirm in utils.mpr.
        super().__init__(**kwargs)

        self.base_url = base_url
        self.save_path = save_path
        self.overwrite = overwrite
        self.skip_error_info = skip_error_info

        if not overwrite:
            # Keep only the items whose destination file is not on disk yet.
            self.data = [uniprot for uniprot in tqdm(self.data, desc="Filtering out existing files...")
                         if not os.path.exists(self.save_path.format(uniprot))]

    def _aggregate(self, final_path: str, sub_paths):
        """Nothing to merge: every download is written straight to its own file."""
        pass

    @staticmethod
    def _fetch(url, save_path):
        """Download ``url`` into ``save_path`` with wget; return True on success."""
        try:
            # An argument list (no shell) keeps spaces and shell metacharacters
            # in the url or path from being interpreted as shell syntax.
            completed = subprocess.run(
                ["wget", "-q", "-o", os.devnull, url, "-O", save_path],
                check=False,
            )
        except OSError:
            # wget is missing or could not be started: treat as a failed download.
            return False
        return completed.returncode == 0

    def _target_static(self, process_id, data, sub_path, *args):
        """
        Download every item of ``data`` and report progress after each one.

        Args:
            process_id: Index of the worker, used to label the progress bar
            data: Items to substitute into the url and path templates
            sub_path: Unused by this implementation
            *args: Unused by this implementation
        """
        for i, uniprot in enumerate(data):
            url = self.base_url.format(uniprot)
            save_path = self.save_path.format(uniprot)

            if not self._fetch(url, save_path):
                # A failed wget can leave an empty or partial output file behind.
                try:
                    os.remove(save_path)
                except OSError:
                    # Best effort: there may be nothing to remove.
                    pass

                if not self.skip_error_info:
                    print(f"Error: {url} cannot be downloaded!", flush=True)

            self.terminal_progress_bar(process_id, i + 1, len(data), f"Process{process_id} Downloading files...")

    def run(self):
        """
        Run this function to download files
        """
        super().run()

    def __len__(self):
        """Return the number of items still scheduled for download."""
        return len(self.data)

    @staticmethod
    def clear_empty_files(path):
        """
        Delete every zero-byte entry directly inside ``path``.

        Args:
            path: Directory to scan (not recursive)

        Returns:
            Number of files removed
        """
        cnt = 0
        for file in tqdm(os.listdir(path), desc="Clearing empty files..."):
            file_path = os.path.join(path, file)
            if os.path.getsize(file_path) == 0:
                os.remove(file_path)
                cnt += 1

        print(f"Removed {cnt} empty files")
        return cnt
|
|
|
|
|
|
class AlphaDBDownloader(Downloader):
    """
    Downloader preconfigured for files of the AlphaFold2 database
    """

    def __init__(self, uniprot_ids, type: str, save_dir: str, **kwargs):
        """
        Args:
            uniprot_ids: Uniprot ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif', 'plddt', "pae"];
                any other value raises KeyError
            save_dir: Saving directory
            **kwargs: Passed unchanged to ``Downloader.__init__``
        """
        # file type -> (remote URL template, local file-name template)
        # NOTE: 'plddt' and 'pae' share the local name "{}.json".
        templates = {
            "pdb": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.pdb", "{}.pdb"),
            "mmcif": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-model_v4.cif", "{}.cif"),
            "plddt": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-confidence_v4.json", "{}.json"),
            "pae": ("https://alphafold.ebi.ac.uk/files/AF-{}-F1-predicted_aligned_error_v4.json", "{}.json"),
        }
        remote_template, local_name = templates[type]
        destination = os.path.join(save_dir, local_name)

        super().__init__(
            data=uniprot_ids,
            base_url=remote_template,
            save_path=destination,
            **kwargs,
        )
|
|
|
|
|
|
class PDBDownloader(Downloader):
    """
    Downloader preconfigured for files of PDB
    """

    def __init__(self, pdb_ids, type: str, save_dir: str, **kwargs):
        """
        Args:
            pdb_ids: PDB ids
            type: Which type of files to download. Must be one of ['pdb', 'mmcif'];
                any other value raises KeyError
            save_dir: Saving directory
            **kwargs: Passed unchanged to ``Downloader.__init__``
        """
        # file type -> (remote URL template, local file-name template)
        templates = {
            "pdb": ("https://files.rcsb.org/download/{}.pdb", "{}.pdb"),
            "mmcif": ("https://files.rcsb.org/download/{}.cif", "{}.cif"),
        }
        remote_template, local_name = templates[type]
        destination = os.path.join(save_dir, local_name)

        super().__init__(
            data=pdb_ids,
            base_url=remote_template,
            save_path=destination,
            **kwargs,
        )
|
|
|
|
|
|
class CATHDownloader(Downloader):
    def __init__(self, cath_ids, save_dir, **kwargs):
        """
        Set up a downloader for pdb files from CATH (version v4_3_0)

        Args:
            cath_ids: CATH ids
            save_dir: Saving directory; each file is stored there as "<id>.pdb"
            **kwargs: Passed unchanged to ``Downloader.__init__``
        """
        destination = os.path.join(save_dir, "{}.pdb")
        remote_template = "http://www.cathdb.info/version/v4_3_0/api/rest/id/{}.pdb"

        super().__init__(
            data=cath_ids,
            base_url=remote_template,
            save_path=destination,
            **kwargs,
        )
|
|
|
|
|
|
def download_pdb(pdb_id: str, format: str, save_path: str):
    """
    Download a single file from PDB

    On failure the (possibly empty or partial) output file is removed and an
    error message is printed; no exception is raised.

    Args:
        pdb_id: PDB id
        format: File format, must be one of ['pdb', 'cif']
        save_path: Saving path
    """
    url = f"https://files.rcsb.org/download/{pdb_id}.{format}"

    try:
        # An argument list (no shell) keeps spaces and shell metacharacters
        # in the url or path from being interpreted as shell syntax.
        succeeded = subprocess.run(
            ["wget", "-q", "-o", os.devnull, url, "-O", save_path],
            check=False,
        ).returncode == 0
    except OSError:
        # wget is missing or could not be started: treat as a failed download.
        succeeded = False

    if not succeeded:
        try:
            os.remove(save_path)
        except OSError:
            # Best effort: there may be nothing to remove.
            pass
        print(f"Error: {url} cannot be downloaded!", flush=True)
|
|
|
|
|
|
def download_af2(uniprot_id: str, format: str, save_path: str):
    """
    Download a single file from AlphaFold2 database

    On failure the (possibly empty or partial) output file is removed and an
    error message is printed; no exception is raised.

    Args:
        uniprot_id: Uniprot id
        format: File format, must be one of ['pdb', 'cif', 'plddt', 'pae']
            ('mmcif' is accepted as an alias of 'cif')
        save_path: Saving path
    """
    # Only the structure files are named "model_v4.<ext>"; the confidence and
    # predicted-aligned-error files have their own names and a .json extension.
    file_names = {
        "pdb": "model_v4.pdb",
        "cif": "model_v4.cif",
        "mmcif": "model_v4.cif",
        "plddt": "confidence_v4.json",
        "pae": "predicted_aligned_error_v4.json",
    }
    # Unknown formats keep the previous "model_v4.<format>" pattern.
    file_name = file_names.get(format, f"model_v4.{format}")
    url = f"https://alphafold.ebi.ac.uk/files/AF-{uniprot_id}-F1-{file_name}"

    try:
        # An argument list (no shell) keeps spaces and shell metacharacters
        # in the url or path from being interpreted as shell syntax.
        succeeded = subprocess.run(
            ["wget", "-q", "-o", os.devnull, url, "-O", save_path],
            check=False,
        ).returncode == 0
    except OSError:
        # wget is missing or could not be started: treat as a failed download.
        succeeded = False

    if not succeeded:
        try:
            os.remove(save_path)
        except OSError:
            # Best effort: there may be nothing to remove.
            pass
        print(f"Error: {url} cannot be downloaded!", flush=True)