Spaces:
No application file
No application file
# Copyright (C) 2002, Thomas Hamelryck ([email protected]) | |
# | |
# This file is part of the Biopython distribution and governed by your | |
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
# Please see the LICENSE file that should have been included as part of this | |
# package. | |
r"""Use the DSSP program to calculate secondary structure and accessibility. | |
You need to have a working version of DSSP (and a license, free for academic | |
use) in order to use this. For DSSP, see https://swift.cmbi.umcn.nl/gv/dssp/. | |
The following Accessible surface area (ASA) values can be used, defaulting | |
to the Sander and Rost values: | |
Ahmad | |
Ahmad et al. 2003 https://doi.org/10.1002/prot.10328 | |
Miller | |
Miller et al. 1987 https://doi.org/10.1016/0022-2836(87)90038-6 | |
Sander | |
Sander and Rost 1994 https://doi.org/10.1002/prot.340200303 | |
Wilke | |
Tien et al. 2013 https://doi.org/10.1371/journal.pone.0080635 | |
The DSSP codes for secondary structure used here are: | |
===== ==== | |
Code Structure | |
===== ==== | |
H Alpha helix (4-12) | |
B Isolated beta-bridge residue | |
E Strand | |
G 3-10 helix | |
I Pi helix | |
T Turn | |
S Bend | |
\- None | |
===== ==== | |
Usage | |
----- | |
The DSSP class can be used to run DSSP on a PDB or mmCIF file, and provides a | |
handle to the DSSP secondary structure and accessibility. | |
**Note** that DSSP can only handle one model, and will only run | |
calculations on the first model in the provided PDB file. | |
Examples | |
-------- | |
Typical use:: | |
from Bio.PDB import PDBParser | |
from Bio.PDB.DSSP import DSSP | |
p = PDBParser() | |
structure = p.get_structure("1MOT", "/local-pdb/1mot.pdb") | |
model = structure[0] | |
dssp = DSSP(model, "/local-pdb/1mot.pdb") | |
Note that the recent DSSP executable from the DSSP-2 package was | |
renamed from ``dssp`` to ``mkdssp``. If using a recent DSSP release, | |
you may need to provide the name of your DSSP executable:: | |
dssp = DSSP(model, '/local-pdb/1mot.pdb', dssp='mkdssp') | |
DSSP data is accessed by a tuple - (chain id, residue id):: | |
a_key = list(dssp.keys())[2] | |
dssp[a_key] | |
The dssp data returned for a single residue is a tuple in the form: | |
============ === | |
Tuple Index Value | |
============ === | |
0 DSSP index | |
1 Amino acid | |
2 Secondary structure | |
3 Relative ASA | |
4 Phi | |
5 Psi | |
6 NH-->O_1_relidx | |
7 NH-->O_1_energy | |
8 O-->NH_1_relidx | |
9 O-->NH_1_energy | |
10 NH-->O_2_relidx | |
11 NH-->O_2_energy | |
12 O-->NH_2_relidx | |
13 O-->NH_2_energy | |
============ === | |
""" | |
import re | |
import os | |
from io import StringIO | |
import subprocess | |
import warnings | |
from Bio.PDB.AbstractPropertyMap import AbstractResiduePropertyMap | |
from Bio.PDB.PDBExceptions import PDBException | |
from Bio.PDB.PDBParser import PDBParser | |
from Bio.Data.PDBData import protein_letters_3to1, residue_sasa_scales | |
from Bio.PDB.MMCIF2Dict import MMCIF2Dict | |
# DSSP relabels cysteines that take part in C-bridges with lowercase
# letters (a, b, c, ...); this pattern recognises such a lowercase code so
# that it can be mapped back to "C" when comparing against the structure.
_dssp_cys = re.compile("[a-z]")
# Maximal ASA of amino acids
# Used for relative accessibility
# Indexed first by scale name (the ``acc_array`` argument of ``DSSP``) and
# then by residue name.
residue_max_acc = residue_sasa_scales
def version(version_string):
    """Turn a dotted version string into a tuple of ints.

    Tuples compare element by element, so ``version("3.9.9")`` sorts before
    ``version("4.0.0")`` where a plain string comparison could not be trusted.
    """
    return tuple(int(component) for component in version_string.split("."))
def ss_to_index(ss):
    """Secondary structure symbol to index.

    H=0
    E=1
    C=2

    Raises
    ------
    AssertionError
        If ``ss`` is not one of ``"H"``, ``"E"`` or ``"C"``.

    """
    if ss == "H":
        return 0
    if ss == "E":
        return 1
    if ss == "C":
        return 2
    # An ``assert`` statement is removed when Python runs with -O, which would
    # let an unknown symbol fall through and return None. Raising explicitly
    # keeps the same exception type while surviving optimised mode.
    raise AssertionError(f"Unknown secondary structure symbol: {ss!r}")
def dssp_dict_from_pdb_file(in_file, DSSP="dssp", dssp_version="3.9.9"):
    """Run DSSP on a structure file and parse what it prints.

    Parameters
    ----------
    in_file : string
        pdb file
    DSSP : string
        DSSP executable (argument to subprocess)
    dssp_version : string
        Version of DSSP executable

    Returns
    -------
    (out_dict, keys) : tuple
        ``out_dict`` maps (chainid, resid) to the tuple built by
        ``_make_dssp_dict`` (amino acid type, secondary structure code,
        accessibility, phi, psi, DSSP index and hydrogen bond data);
        ``keys`` lists the same identifiers in output order.

    Raises
    ------
    FileNotFoundError
        If neither ``DSSP`` nor the ``mkdssp`` fallback can be started.
    Exception
        If the program writes nothing to standard output.

    Examples
    --------
    How dssp_dict_from_pdb_file could be used::

        from Bio.PDB.DSSP import dssp_dict_from_pdb_file
        dssp_tuple = dssp_dict_from_pdb_file("/local-pdb/1fat.pdb")
        dssp_dict = dssp_tuple[0]
        print(dssp_dict['A',(' ', 1, ' ')])

    """
    # From 4.0.0 on the classic output format has to be requested explicitly.
    if version(dssp_version) < version("4.0.0"):
        arguments = [in_file]
    else:
        arguments = ["--output-format=dssp", in_file]

    # Universal newlines gives text handles rather than bytes handles.
    popen_options = {
        "universal_newlines": True,
        "stdout": subprocess.PIPE,
        "stderr": subprocess.PIPE,
    }

    # Newer releases install the binary as 'mkdssp', so a missing 'dssp' is
    # retried under that name (Debian ships a 'dssp' symlink, others do not).
    try:
        process = subprocess.Popen([DSSP, *arguments], **popen_options)
    except FileNotFoundError:
        if DSSP == "mkdssp":
            raise
        process = subprocess.Popen(["mkdssp", *arguments], **popen_options)

    out, err = process.communicate()

    # Anything on stderr is passed on to the user as a warning.
    if err.strip():
        warnings.warn(err)
    if not out.strip():
        raise Exception("DSSP failed to produce an output")

    return _make_dssp_dict(StringIO(out))
def make_dssp_dict(filename):
    """Parse a DSSP output file into a dictionary.

    Opens ``filename`` and hands the handle to ``_make_dssp_dict``, whose
    result (a dictionary keyed by (chainid, resid) plus the list of keys)
    is returned unchanged.

    Parameters
    ----------
    filename : string
        the DSSP output file

    """
    with open(filename) as dssp_file:
        parsed = _make_dssp_dict(dssp_file)
    return parsed
def _make_dssp_dict(handle): | |
"""Return a DSSP dictionary, used by mask_dssp_dict (PRIVATE). | |
DSSP dictionary maps (chainid, resid) to an amino acid, | |
secondary structure symbol, solvent accessibility value, and hydrogen bond | |
information (relative dssp indices and hydrogen bond energies) from an open | |
DSSP file object. | |
Parameters | |
---------- | |
handle : file | |
the open DSSP output file handle | |
""" | |
dssp = {} | |
start = 0 | |
keys = [] | |
for line in handle: | |
sl = line.split() | |
if len(sl) < 2: | |
continue | |
if sl[1] == "RESIDUE": | |
# Start parsing from here | |
start = 1 | |
continue | |
if not start: | |
continue | |
if line[9] == " ": | |
# Skip -- missing residue | |
continue | |
dssp_index = int(line[:5]) | |
resseq = int(line[5:10]) | |
icode = line[10] | |
chainid = line[11] | |
aa = line[13] | |
ss = line[16] | |
if ss == " ": | |
ss = "-" | |
try: | |
NH_O_1_relidx = int(line[38:45]) | |
NH_O_1_energy = float(line[46:50]) | |
O_NH_1_relidx = int(line[50:56]) | |
O_NH_1_energy = float(line[57:61]) | |
NH_O_2_relidx = int(line[61:67]) | |
NH_O_2_energy = float(line[68:72]) | |
O_NH_2_relidx = int(line[72:78]) | |
O_NH_2_energy = float(line[79:83]) | |
acc = int(line[34:38]) | |
phi = float(line[103:109]) | |
psi = float(line[109:115]) | |
except ValueError as exc: | |
# DSSP output breaks its own format when there are >9999 | |
# residues, since only 4 digits are allocated to the seq num | |
# field. See 3kic chain T res 321, 1vsy chain T res 6077. | |
# Here, look for whitespace to figure out the number of extra | |
# digits, and shift parsing the rest of the line by that amount. | |
if line[34] != " ": | |
shift = line[34:].find(" ") | |
NH_O_1_relidx = int(line[38 + shift : 45 + shift]) | |
NH_O_1_energy = float(line[46 + shift : 50 + shift]) | |
O_NH_1_relidx = int(line[50 + shift : 56 + shift]) | |
O_NH_1_energy = float(line[57 + shift : 61 + shift]) | |
NH_O_2_relidx = int(line[61 + shift : 67 + shift]) | |
NH_O_2_energy = float(line[68 + shift : 72 + shift]) | |
O_NH_2_relidx = int(line[72 + shift : 78 + shift]) | |
O_NH_2_energy = float(line[79 + shift : 83 + shift]) | |
acc = int(line[34 + shift : 38 + shift]) | |
phi = float(line[103 + shift : 109 + shift]) | |
psi = float(line[109 + shift : 115 + shift]) | |
else: | |
raise ValueError(exc) from None | |
res_id = (" ", resseq, icode) | |
dssp[(chainid, res_id)] = ( | |
aa, | |
ss, | |
acc, | |
phi, | |
psi, | |
dssp_index, | |
NH_O_1_relidx, | |
NH_O_1_energy, | |
O_NH_1_relidx, | |
O_NH_1_energy, | |
NH_O_2_relidx, | |
NH_O_2_energy, | |
O_NH_2_relidx, | |
O_NH_2_energy, | |
) | |
keys.append((chainid, res_id)) | |
return dssp, keys | |
class DSSP(AbstractResiduePropertyMap):
    """Run DSSP and parse secondary structure and accessibility.

    Run DSSP on a PDB/mmCIF file, and provide a handle to the
    DSSP secondary structure and accessibility.

    **Note** that DSSP can only handle one model.

    Examples
    --------
    How DSSP could be used::

        from Bio.PDB import PDBParser
        from Bio.PDB.DSSP import DSSP
        p = PDBParser()
        structure = p.get_structure("1MOT", "/local-pdb/1mot.pdb")
        model = structure[0]
        dssp = DSSP(model, "/local-pdb/1mot.pdb")
        # DSSP data is accessed by a tuple (chain_id, res_id)
        a_key = list(dssp.keys())[2]
        # (dssp index, amino acid, secondary structure, relative ASA, phi, psi,
        # NH_O_1_relidx, NH_O_1_energy, O_NH_1_relidx, O_NH_1_energy,
        # NH_O_2_relidx, NH_O_2_energy, O_NH_2_relidx, O_NH_2_energy)
        dssp[a_key]

    """

    # Alternative location codes that DSSP treats as "the first" conformer.
    _FIRST_ALTLOCS = ("A", "1", " ")

    @staticmethod
    def _resid2code(res_id):
        """Serialize a residue's resseq and icode for easy comparison (PRIVATE)."""
        return f"{res_id[1]}{res_id[2]}"

    @staticmethod
    def _find_het_residue(chain, res_id, first_altloc_only=False):
        """Find a non-water HETATM residue with the given resseq+icode (PRIVATE).

        With ``first_altloc_only`` the residue's first atom must also carry a
        blank, "A" or "1" altloc. Returns the first match in chain order, or
        None when the chain holds no such residue.
        """
        wanted = DSSP._resid2code(res_id)
        for residue in chain:
            if residue.id[0] in (" ", "W"):
                continue
            if DSSP._resid2code(residue.id) != wanted:
                continue
            if (
                first_altloc_only
                and residue.get_list()[0].get_altloc() not in DSSP._FIRST_ALTLOCS
            ):
                continue
            return residue
        return None

    @staticmethod
    def _run_dssp(executable, in_file):
        """Query the executable's version, then run it on in_file (PRIVATE).

        Returns ``(dssp_version, dssp_dict, dssp_keys)``. FileNotFoundError
        from a missing executable is left for the caller to handle.
        """
        version_string = subprocess.check_output(
            [executable, "--version"], universal_newlines=True
        )
        dssp_version = re.search(r"\s*([\d.]+)", version_string).group(1)
        dssp_dict, dssp_keys = dssp_dict_from_pdb_file(
            in_file, executable, dssp_version
        )
        return dssp_version, dssp_dict, dssp_keys

    def __init__(self, model, in_file, dssp="dssp", acc_array="Sander", file_type=""):
        """Create a DSSP object.

        Parameters
        ----------
        model : Model
            The first model of the structure
        in_file : string
            Either a PDB file, an mmCIF file or a DSSP file.
        dssp : string
            The dssp executable (ie. the argument to subprocess)
        acc_array : string
            Accessible surface area (ASA) from either Miller et al. (1987),
            Sander & Rost (1994), Wilke: Tien et al. 2013, or Ahmad et al.
            (2003) as string Sander/Wilke/Miller/Ahmad. Defaults to Sander.
        file_type: string
            File type switch: either PDB, MMCIF or DSSP. Inferred from the
            file extension by default.

        Raises
        ------
        AssertionError
            If the file type is none of PDB, mmCIF or DSSP.
        FileNotFoundError
            If the DSSP executable cannot be found under either name.
        KeyError
            If a residue reported by DSSP is absent from ``model``.
        PDBException
            If DSSP and the structure disagree on a residue's amino acid.

        """
        self.residue_max_acc = residue_max_acc[acc_array]

        # create DSSP dictionary
        if file_type == "":
            file_type = os.path.splitext(in_file)[1][1:]
        file_type = file_type.upper()
        if file_type == "CIF":
            file_type = "MMCIF"
        if file_type not in ("PDB", "MMCIF", "DSSP"):
            # Raised explicitly (not via ``assert``) so the check is still
            # enforced when Python runs with -O; the type is kept for callers.
            raise AssertionError("File type must be PDB, mmCIF or DSSP")

        if file_type == "DSSP":
            # If the input file is a DSSP file just parse it directly:
            dssp_dict, dssp_keys = make_dssp_dict(in_file)
        else:
            # If the input file is a PDB or mmCIF file run DSSP and parse
            # output. Newer versions of DSSP program call the binary 'mkdssp',
            # so calling 'dssp' will not work in some operating systems
            # (Debian distribution of DSSP includes a symlink for 'dssp').
            try:
                dssp_version, dssp_dict, dssp_keys = self._run_dssp(dssp, in_file)
            except FileNotFoundError:
                if dssp == "dssp":
                    dssp = "mkdssp"
                elif dssp == "mkdssp":
                    dssp = "dssp"
                else:
                    raise
                dssp_version, dssp_dict, dssp_keys = self._run_dssp(dssp, in_file)

        dssp_map = {}
        dssp_list = []

        # DSSP outputs label_asym_id from the mmCIF file as the chain ID
        # But MMCIFParser reads in the auth_asym_id
        # Here we create a dictionary to map label_asym_id to auth_asym_id
        # using the mmCIF file. The condition does not change per residue, so
        # it is evaluated once; ``dssp_version`` only exists (and is only
        # looked at) when DSSP was actually run.
        remap_chains = file_type == "MMCIF" and version(dssp_version) < version(
            "4.0.0"
        )
        if remap_chains:
            mmcif_dict = MMCIF2Dict(in_file)
            mmcif_chain_dict = {}
            for label_id, auth_id in zip(
                mmcif_dict["_atom_site.label_asym_id"],
                mmcif_dict["_atom_site.auth_asym_id"],
            ):
                # The first atom seen for a label chain decides its mapping
                mmcif_chain_dict.setdefault(label_id, auth_id)
            dssp_mapped_keys = []

        # Now create a dictionary that maps Residue objects to
        # secondary structure and accessibility, and a list of
        # (residue, (secondary structure, accessibility)) tuples
        for key in dssp_keys:
            chain_id, res_id = key
            if remap_chains:
                chain_id = mmcif_chain_dict[chain_id]
                dssp_mapped_keys.append((chain_id, res_id))
            chain = model[chain_id]
            try:
                res = chain[res_id]
            except KeyError:
                # In DSSP, HET field is not considered in residue identifier.
                # Thus HETATM records may cause unnecessary exceptions.
                # (See 3jui chain A res 593.)
                # Try the lookup again with all HETATM other than water
                res = self._find_het_residue(chain, res_id)
                if res is None:
                    raise KeyError(res_id) from None

            # For disordered residues of point mutations, Biopython uses the
            # last one as default, But DSSP takes the first one (alternative
            # location is blank, A or 1). See 1h9h chain E resi 22.
            # Here we select the res in which all atoms have altloc blank, A or
            # 1. If no such residues are found, simply use the first one that
            # appears (as DSSP does).
            disorder = res.is_disordered()
            if disorder == 2:
                for rk in res.disordered_get_id_list():
                    # All atoms in the disordered residue should have the same
                    # altloc, so it suffices to check the altloc of the first
                    # atom.
                    altloc = res.child_dict[rk].get_list()[0].get_altloc()
                    if altloc in self._FIRST_ALTLOCS:
                        res.disordered_select(rk)
                        break
                else:
                    # Simply select the first one
                    res.disordered_select(res.disordered_get_id_list()[0])

            # Sometimes point mutations are put into HETATM and ATOM with altloc
            # 'A' and 'B'.
            # See 3piu chain A residue 273:
            #   <Residue LLP het=H_LLP resseq=273 icode= >
            #   <Residue LYS het=  resseq=273 icode= >
            # DSSP uses the HETATM LLP as it has altloc 'A'
            # We check the altloc code here.
            elif disorder == 1:
                # Check altloc of all atoms in the DisorderedResidue. If it
                # contains blank, A or 1, then use it. Otherwise, look for HET
                # residues of the same seq+icode. If no such HET residues are
                # found, just accept the current one.
                altlocs = {a.get_altloc() for a in res.get_unpacked_list()}
                if altlocs.isdisjoint(self._FIRST_ALTLOCS):
                    # Try again with all HETATM other than water
                    het_res = self._find_het_residue(
                        chain, res_id, first_altloc_only=True
                    )
                    if het_res is not None:
                        res = het_res

            (
                aa,
                ss,
                acc,
                phi,
                psi,
                dssp_index,
                NH_O_1_relidx,
                NH_O_1_energy,
                O_NH_1_relidx,
                O_NH_1_energy,
                NH_O_2_relidx,
                NH_O_2_energy,
                O_NH_2_relidx,
                O_NH_2_energy,
            ) = dssp_dict[key]

            res.xtra["SS_DSSP"] = ss
            res.xtra["EXP_DSSP_ASA"] = acc
            res.xtra["PHI_DSSP"] = phi
            res.xtra["PSI_DSSP"] = psi
            res.xtra["DSSP_INDEX"] = dssp_index
            res.xtra["NH_O_1_RELIDX_DSSP"] = NH_O_1_relidx
            res.xtra["NH_O_1_ENERGY_DSSP"] = NH_O_1_energy
            res.xtra["O_NH_1_RELIDX_DSSP"] = O_NH_1_relidx
            res.xtra["O_NH_1_ENERGY_DSSP"] = O_NH_1_energy
            res.xtra["NH_O_2_RELIDX_DSSP"] = NH_O_2_relidx
            res.xtra["NH_O_2_ENERGY_DSSP"] = NH_O_2_energy
            res.xtra["O_NH_2_RELIDX_DSSP"] = O_NH_2_relidx
            res.xtra["O_NH_2_ENERGY_DSSP"] = O_NH_2_energy

            # Relative accessibility
            resname = res.get_resname()
            try:
                rel_acc = acc / self.residue_max_acc[resname]
            except KeyError:
                # Invalid value for resname
                rel_acc = "NA"
            else:
                if rel_acc > 1.0:
                    rel_acc = 1.0
            res.xtra["EXP_DSSP_RASA"] = rel_acc

            # Verify if AA in DSSP == AA in Structure
            # Something went wrong if this is not true!
            # NB: DSSP uses X often
            resname = protein_letters_3to1.get(resname, "X")
            if resname == "C":
                # DSSP renames C in C-bridges to a,b,c,d,...
                # - we rename it back to 'C'
                if _dssp_cys.match(aa):
                    aa = "C"
            # Take care of HETATM again
            if (resname != aa) and (res.id[0] == " " or aa != "X"):
                raise PDBException(f"Structure/DSSP mismatch at {res}")

            dssp_vals = (
                dssp_index,
                aa,
                ss,
                rel_acc,
                phi,
                psi,
                NH_O_1_relidx,
                NH_O_1_energy,
                O_NH_1_relidx,
                O_NH_1_energy,
                NH_O_2_relidx,
                NH_O_2_energy,
                O_NH_2_relidx,
                O_NH_2_energy,
            )
            dssp_map[(chain_id, res_id)] = dssp_vals
            dssp_list.append(dssp_vals)

        if remap_chains:
            # Expose the keys under the chain ids the model actually uses
            dssp_keys = dssp_mapped_keys
        AbstractResiduePropertyMap.__init__(self, dssp_map, dssp_keys, dssp_list)