Spaces:
No application file
No application file
# Copyright 2019-2022 by Robert T. Miller. All rights reserved.
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Convert XYZ Structure to internal coordinates and back, test result.""" | |
import re | |
import numpy as np | |
from itertools import zip_longest | |
try: | |
import numpy | |
except ImportError: | |
from Bio import MissingPythonDependencyError | |
raise MissingPythonDependencyError( | |
"Install NumPy to build proteins from internal coordinates." | |
) | |
from Bio.PDB.PDBExceptions import PDBException | |
from io import StringIO | |
from Bio.File import as_handle | |
from Bio.PDB.PDBIO import PDBIO | |
from Bio.PDB.Structure import Structure | |
from Bio.PDB.internal_coords import IC_Residue | |
from Bio.PDB.PICIO import write_PIC, read_PIC, enumerate_atoms, pdb_date | |
# for typing | |
from typing import Dict, Union, Any, Tuple | |
from Bio.PDB.Atom import Atom | |
from Bio.PDB.Residue import Residue, DisorderedResidue | |
from Bio.PDB.Model import Model | |
from Bio.PDB.Chain import Chain | |
def structure_rebuild_test(entity, verbose: bool = False, quick: bool = False) -> Dict:
    """Check that an entity survives a round trip through internal coordinates.

    Internal coordinates are computed for ``entity`` and serialised with
    :func:`.write_PIC` into an in-memory buffer.  The buffer is read back with
    :func:`.read_PIC`, XYZ coordinates are regenerated on the copy, and the
    copy is compared against the original with :func:`.compare_residues`.

    See :data:`IC_Residue.pic_accuracy` to vary numeric accuracy of the
    intermediate .pic file if the only issue is small differences in
    coordinates.

    Note that with default settings, deuterated initial structures will fail
    the comparison, as will structures loaded with alternate
    `IC_Residue.accept_atoms` settings.  Use `quick=True` and/or variations on
    `AtomKey.d2h` and `IC_Residue.accept_atoms` settings.

    :param Entity entity: Biopython Structure, Model or Chain.
        Structure to test
    :param bool verbose: default False.
        print extra messages
    :param bool quick: default False.
        only check the internal coords atomArrays are identical
    :returns: dict
        comparison dict from :func:`.compare_residues`
    """
    entity.atom_to_internal_coordinates(verbose)

    # Serialise to an in-memory .pic "file" and parse it straight back.
    pic_buffer = StringIO()
    write_PIC(entity, pic_buffer)
    pic_buffer.seek(0)
    rebuilt = read_PIC(pic_buffer, verbose=verbose, quick=quick)

    if isinstance(entity, Chain):
        # The parsed result is narrowed to its first chain so that both sides
        # of the comparison are chains.
        rebuilt = next(rebuilt.get_chains())

    if verbose:
        report_IC(rebuilt, verbose=True)

    rebuilt.internal_to_atom_coordinates(verbose)
    return compare_residues(entity, rebuilt, verbose=verbose, quick=quick)
def report_IC(
    entity: Union[Structure, Model, Chain, Residue],
    reportDict: Dict[str, Any] = None,
    verbose: bool = False,
) -> Dict[str, Any]:
    """Generate dict with counts of ic data elements for each entity level.

    The entity is walked recursively (Structure -> Model -> Chain -> Residue)
    and the counters in ``reportDict`` are incremented along the way.

    reportDict entries are:
      - idcode : PDB ID
      - hdr : PDB header lines
      - mdl : models
      - chn : chains
      - chn_ids : list of chain IDs, in visiting order
      - res : residue objects
      - res_e : residues with dihedra and/or hedra
      - dih : dihedra
      - hed : hedra

    :param Entity entity: Biopython PDB Entity object: S, M, C or R
    :param dict reportDict: default None.
        Existing report to accumulate into; a fresh one is created when None.
        Any of the entries above that are missing are added with their
        starting value.
    :param bool verbose: default False.
        Print a one line summary of the counters before returning
    :raises PDBException: if entity level not S, M, C, or R
    :raises Exception: if entity does not have .level attribute
    :returns: dict with counts of IC data elements
    """
    defaults = {
        "idcode": None,
        "hdr": 0,
        "mdl": 0,
        "chn": 0,
        "chn_ids": [],
        "res": 0,
        "res_e": 0,
        "dih": 0,
        "hed": 0,
    }
    if reportDict is None:
        reportDict = defaults
    else:
        # Tolerate a partially filled caller-supplied dict rather than
        # failing with KeyError on the first missing counter.
        for key, start in defaults.items():
            reportDict.setdefault(key, start)

    # A missing attribute surfaces as AttributeError (not KeyError), so probe
    # it on its own: this gives non-entity arguments the documented error
    # without masking unrelated failures further down.
    try:
        level = entity.level
    except AttributeError as err:
        raise Exception(
            "report_IC: argument is not a Biopython PDB Entity " + str(entity)
        ) from err

    if "A" == level:
        raise PDBException("No IC output at Atom level")
    elif isinstance(entity, (Residue, DisorderedResidue)):  # "R" == level
        ric = entity.internal_coord
        if ric:
            reportDict["res"] += 1
            dlen = len(ric.dihedra)
            hlen = len(ric.hedra)
            if 0 < dlen or 0 < hlen:
                reportDict["res_e"] += 1
                reportDict["dih"] += dlen
                reportDict["hed"] += hlen
    elif isinstance(entity, Chain):  # "C" == level
        reportDict["chn"] += 1
        reportDict["chn_ids"].append(entity.id)
        for res in entity:
            reportDict = report_IC(res, reportDict)
    elif isinstance(entity, Model):  # "M" == level
        reportDict["mdl"] += 1
        for chn in entity:
            reportDict = report_IC(chn, reportDict)
    elif isinstance(entity, Structure):  # "S" == level
        if hasattr(entity, "header"):
            if reportDict["idcode"] is None:
                reportDict["idcode"] = entity.header.get("idcode", None)
            # "head" and "name" each count as one header line when present.
            if entity.header.get("head", None):
                reportDict["hdr"] += 1
            if entity.header.get("name", None):
                reportDict["hdr"] += 1
        for mdl in entity:
            reportDict = report_IC(mdl, reportDict)
    else:
        raise PDBException("Cannot identify level: " + str(level))

    if verbose:
        print(
            "{} : {} models {} chains {} {} residue objects "
            "{} residues with {} dihedra {} hedra".format(
                reportDict["idcode"],
                reportDict["mdl"],
                reportDict["chn"],
                reportDict["chn_ids"],
                reportDict["res"],
                reportDict["res_e"],
                reportDict["dih"],
                reportDict["hed"],
            )
        )
    return reportDict
def IC_duplicate(entity) -> Structure:
    """Duplicate structure entity with IC data, no atom coordinates.

    Employs :func:`.write_PIC`, :func:`.read_PIC` with StringIO buffer.
    Internal coordinates are computed first if no residue of ``entity``
    already carries hedra: via ``entity.atom_to_internal_coordinates()`` for
    Structure/Model/Chain, or via the residue's own ``IC_Residue`` (created
    here if absent) when ``entity`` is a Residue.

    :param Entity entity: Biopython PDB Entity (will fail for Atom)
    :returns: Biopython PDBStructure, no Atom objects except initial coords
    """
    is_residue = isinstance(entity, Residue)  # "R" == entity.level

    # NOTE(review): Residue does not appear to offer get_residues() (it is
    # provided by Chain/Model/Structure) - confirm.  Treating a lone residue
    # as its own one-element sequence keeps the Residue branch below reachable.
    residues = [entity] if is_residue else entity.get_residues()
    has_internal_coords = any(
        res.internal_coord and len(res.internal_coord.hedra) > 0
        for res in residues
    )

    if not has_internal_coords:
        if is_residue:
            # works better at chain level but leave option here
            if not entity.internal_coord:
                entity.internal_coord = IC_Residue(entity)
            entity.internal_coord.atom_to_internal_coordinates()
        else:
            entity.atom_to_internal_coordinates()

    sp = StringIO()
    write_PIC(entity, sp)
    sp.seek(0)
    return read_PIC(sp)
def _atmfid_d2h(atm: Atom) -> Tuple:
    """Return the full ID of ``atm`` with the first "D" in its name made "H".

    Only the first five fields of the full ID are used; the fifth is taken to
    be a two-item (atom name, second item) pair and only its name is altered.
    """
    full_id = list(atm.get_full_id())
    atom_part = list(full_id[4])
    # Only the first "D" is substituted, wherever it occurs in the name.
    hydrogen_name = re.sub("D", "H", atom_part[0], count=1)
    return (*full_id[:4], (hydrogen_name, atom_part[1]))
def _cmp_atm(
    r0: Residue,
    r1: Residue,
    a0: Atom,
    a1: Atom,
    verbose: bool,
    cmpdict: Dict,
    rtol: float = None,
    atol: float = None,
) -> None:
    """Compare one pair of atoms and update the counters in ``cmpdict``.

    ``cmpdict["aCount"]`` is always incremented.  When both atoms are present,
    ``aFullIdMatchCount`` is incremented if their full IDs are equal (directly
    or after the D->H substitution of :func:`_atmfid_d2h` on ``a0``), and
    ``aCoordMatchCount`` is incremented if their coordinates agree.

    :param Residue r0,r1: residues owning the atoms, used only for messages
    :param Atom a0,a1: atoms to compare; either may be None if missing
    :param bool verbose: print a line for every mismatch
    :param dict cmpdict: counters, updated in place
    :param float rtol, atol: default None.
        With both None, coordinates are rounded to 3 places and tested for
        equality.  Otherwise ``numpy.allclose`` is used, with 1e-03 standing
        in for whichever of the two was not given.
    """
    cmpdict["aCount"] += 1

    # An atom present on one side only counts as a mismatch: report and stop.
    if a0 is None:
        if verbose:
            print(
                r1.get_full_id(),
                "None !=",
                a1.get_full_id(),
                a1.parent.resname,
            )
        return
    if a1 is None:
        if verbose:
            print(
                r0.get_full_id(),
                a0.get_full_id(),
                a0.parent.resname,
                "!= None",
            )
        return

    if a0.get_full_id() == a1.get_full_id() or _atmfid_d2h(a0) == a1.get_full_id():
        cmpdict["aFullIdMatchCount"] += 1
    elif verbose:
        print(
            r0.get_full_id(),
            a0.get_full_id(),
            a0.parent.resname,
            "!=",
            a1.get_full_id(),
        )

    if rtol is None and atol is None:
        a0c = numpy.round(a0.get_coord(), 3)
        a1c = numpy.round(a1.get_coord(), 3)
        ac_rslt = numpy.array_equal(a0c, a1c)
    else:
        a0c = a0.get_coord()
        a1c = a1.get_coord()
        # numpy.allclose cannot take None for a tolerance, so a caller giving
        # only one of rtol/atol gets 1e-03 for the other.
        ac_rslt = numpy.allclose(
            a0c,
            a1c,
            rtol=1e-03 if rtol is None else rtol,
            atol=1e-03 if atol is None else atol,
        )

    if ac_rslt:
        cmpdict["aCoordMatchCount"] += 1
    elif verbose:
        print(
            "atom coords disagree:",
            r0.get_full_id(),
            a0.get_full_id(),
            a1.get_full_id(),
            a0c,
            "!=",
            a1c,
        )
def _cmp_res(
    r0: Residue,
    r1: Residue,
    verbose: bool,
    cmpdict: Dict,
    rtol: float = None,
    atol: float = None,
) -> None:
    """Compare one pair of residues and update the counters in ``cmpdict``.

    Records the chain ID of ``r0``, compares residue full IDs, compares the
    ``rprev``/``rnext`` links when ``r0`` has internal coordinates, and, for
    residues that pass the water/DNA filter below, compares atoms pairwise
    through :func:`_cmp_atm`.

    :param Residue r0,r1: residues to compare
    :param bool verbose: print a line for every mismatch
    :param dict cmpdict: counters, updated in place
    :param float rtol, atol: passed through to :func:`_cmp_atm`
    """
    res_id, fid0, fid1 = r0.id, r0.full_id, r1.full_id

    chain_id = r0.parent.id
    if chain_id not in cmpdict["chains"]:
        cmpdict["chains"].append(chain_id)

    cmpdict["rCount"] += 1
    if fid0 == fid1:
        cmpdict["rMatchCount"] += 1
    elif verbose:
        print(fid0, "!=", fid1)

    ric0 = getattr(r0, "internal_coord", None)
    if ric0 is not None:
        ric1 = r1.internal_coord
        # Neighbour links are compared by the sorted rbase of each neighbour.
        prev0 = sorted([nbr.rbase for nbr in ric0.rprev])
        prev1 = sorted([nbr.rbase for nbr in ric1.rprev])
        next0 = sorted([nbr.rbase for nbr in ric0.rnext])
        next1 = sorted([nbr.rbase for nbr in ric1.rnext])
        if prev0 != prev1:
            if verbose:
                print(r0, "rprev error:", prev0, "!=", prev1)
            cmpdict["rpnMismatchCount"] += 1
        if next0 != next1:
            if verbose:
                print(r0, "rnext error", next0, "!=", next1)
            cmpdict["rpnMismatchCount"] += 1

    # skip water, DNA (' ' == [0] for pdb, 2 == len() for mmcif)
    if not (" " == res_id[0] and " " != r0.resname[0] and 2 != len(r0.resname)):
        return

    cmpdict["residues"] += 1

    def find_atom(residue, key):
        """Look up ``key``, retrying with its first "D" replaced by "H"."""
        found = residue.child_dict.get(key, None)
        if found is None:
            found = residue.child_dict.get(re.sub("D", "H", key, count=1), None)
        return found

    # Iterate over the atom names of whichever residue has more atoms.
    longer = r0 if len(r0.child_dict) >= len(r1.child_dict) else r1
    for ak in longer.child_dict:
        a0 = find_atom(r0, ak)
        a1 = find_atom(r1, ak)

        if a0 is None or a1 is None:
            _cmp_atm(r0, r1, a0, a1, verbose, cmpdict, rtol=rtol, atol=atol)
            continue

        dis0, dis1 = a0.is_disordered(), a1.is_disordered()
        if 0 == dis0 == dis1:
            _cmp_atm(r0, r1, a0, a1, verbose, cmpdict, rtol=rtol, atol=atol)
        elif 2 == dis0 == dis1:
            # Both sides are disordered atoms: compare each child of a0
            # against the child of a1 stored under the same key.
            cmpdict["disAtmCount"] += 1
            for child_key in a0.child_dict:
                _cmp_atm(
                    r0,
                    r1,
                    a0.child_dict.get(child_key, None),
                    a1.child_dict.get(child_key, None),
                    verbose,
                    cmpdict,
                    rtol=rtol,
                    atol=atol,
                )
        else:
            # Disorder flags differ: counted as an atom with no match.
            if verbose:
                print("disorder disagreement:", r0.get_full_id(), ak)
            cmpdict["aCount"] += 1
def compare_residues(
    e0: Union[Structure, Model, Chain],
    e1: Union[Structure, Model, Chain],
    verbose: bool = False,
    quick: bool = False,
    rtol: float = None,
    atol: float = None,
) -> Dict[str, Any]:
    """Compare full IDs and atom coordinates for 2 Biopython PDB entities.

    Skip DNA and HETATMs.

    Residues (or, with ``quick``, chains) are paired positionally.  If one
    entity has more of them than the other, the unpaired ones are counted as
    mismatches instead of raising.

    :param Entity e0,e1: Biopython PDB Entity objects (S, M or C).
        Structures, Models or Chains to be compared
    :param bool verbose:
        Whether to print mismatch info, default False
    :param bool quick: default False.
        Only check atomArrays are identical, aCoordMatchCount=0 if different
    :param float rtol, atol: default 1e-03, 1e-03 or round to 3 places.
        Numpy allclose parameters; default is to round atom coordinates to 3
        places and test equal.  For 'quick' will use defaults above for
        comparing atomArrays
    :returns dict:
        Result counts for Residues, Full ID match Residues, Atoms,
        Full ID match atoms, and Coordinate match atoms; report string;
        error status (bool).  Keys: chains, residues, rCount, rMatchCount,
        rpnMismatchCount, aCount, disAtmCount, aCoordMatchCount,
        aFullIdMatchCount, id0, id1, pass, report
    """
    cmpdict: Dict[str, Any] = {}
    cmpdict["chains"] = []  # list of chain IDs (union over both structures)
    cmpdict["residues"] = 0  # count of not HETATM residues in longest chain
    cmpdict["rCount"] = 0  # Biopython Residues (includes HETATMs, waters)
    cmpdict["rMatchCount"] = 0  # full ID match Biopython Residues e0, e1
    cmpdict["rpnMismatchCount"] = 0  # res prev, next links not matched
    cmpdict["aCount"] = 0  # Atoms including disordered in longest e0 or e1
    cmpdict["disAtmCount"] = 0  # disordered atoms in longest e0 or e1
    cmpdict["aCoordMatchCount"] = 0  # atoms with coordinates match e0, e1
    cmpdict["aFullIdMatchCount"] = 0  # atoms with full ID match e0, e1
    cmpdict["id0"] = e0.get_full_id()
    cmpdict["id1"] = e1.get_full_id()
    cmpdict["pass"] = None
    cmpdict["report"] = None

    if quick:
        if isinstance(e0, Chain):
            arr0 = e0.internal_coord.atomArray
            n_atoms = 0 if arr0 is None else numpy.size(arr0, 0)
            cmpdict["aCount"] = n_atoms
            if _atom_arrays_match(arr0, e1.internal_coord.atomArray, rtol, atol):
                cmpdict["aCoordMatchCount"] = n_atoms
            # An empty or missing atomArray is never a pass for a lone chain.
            cmpdict["pass"] = cmpdict["aCoordMatchCount"] > 0
        else:
            cmpdict["pass"] = True
            for c0, c1 in zip_longest(e0.get_chains(), e1.get_chains()):
                if c0 is None or c1 is None:
                    # zip_longest pads the shorter side with None: a chain
                    # with no partner can only be a failure.
                    cmpdict["pass"] = False
                    continue
                arr0 = c0.internal_coord.atomArray
                if arr0 is None:
                    continue
                n_atoms = numpy.size(arr0, 0)
                cmpdict["aCount"] += n_atoms
                if _atom_arrays_match(arr0, c1.internal_coord.atomArray, rtol, atol):
                    cmpdict["aCoordMatchCount"] += n_atoms
                else:
                    cmpdict["pass"] = False
            if cmpdict["aCoordMatchCount"] < cmpdict["aCount"]:
                cmpdict["pass"] = False
    else:
        for r0, r1 in zip_longest(e0.get_residues(), e1.get_residues()):
            if r0 is None or r1 is None:
                _count_unpaired_residue(r0, r1, verbose, cmpdict)
            elif 2 == r0.is_disordered() == r1.is_disordered():
                for dr0, dr1 in zip_longest(
                    r0.child_dict.values(), r1.child_dict.values()
                ):
                    if dr0 is None or dr1 is None:
                        _count_unpaired_residue(dr0, dr1, verbose, cmpdict)
                    else:
                        _cmp_res(dr0, dr1, verbose, cmpdict, rtol=rtol, atol=atol)
            else:
                _cmp_res(r0, r1, verbose, cmpdict, rtol=rtol, atol=atol)

        cmpdict["pass"] = (
            cmpdict["rMatchCount"] == cmpdict["rCount"]
            and cmpdict["aCoordMatchCount"] == cmpdict["aCount"]
            and cmpdict["aFullIdMatchCount"] == cmpdict["aCount"]
            and cmpdict["rpnMismatchCount"] == 0
        )

    rstr = (
        "{}:{} {} -- {} of {} residue IDs match; {} residues {} atom coords, "
        "{} full IDs of {} atoms ({} disordered) match : {}".format(
            cmpdict["id0"],
            cmpdict["id1"],
            cmpdict["chains"],
            cmpdict["rMatchCount"],
            cmpdict["rCount"],
            cmpdict["residues"],
            cmpdict["aCoordMatchCount"],
            cmpdict["aFullIdMatchCount"],
            cmpdict["aCount"],
            cmpdict["disAtmCount"],
            "ERROR" if not cmpdict["pass"] else "ALL OK",
        )
    )
    if not cmpdict["pass"]:
        if cmpdict["rMatchCount"] != cmpdict["rCount"]:
            rstr += " -RESIDUE IDS-"
        # Each flag is measured against the atom total, mirroring the pass
        # test above, so a coordinate failure is named even when the ID
        # count happens to be equally short.
        if cmpdict["aCoordMatchCount"] != cmpdict["aCount"]:
            rstr += " -COORDINATES-"
        # Atom IDs are never examined in quick mode, so do not blame them.
        if not quick and cmpdict["aFullIdMatchCount"] != cmpdict["aCount"]:
            rstr += " -ATOM IDS-"
    cmpdict["report"] = rstr
    return cmpdict


def _atom_arrays_match(arr0, arr1, rtol, atol) -> bool:
    """Return True if both atomArrays exist, share a shape and are close."""
    if arr0 is None or arr1 is None:
        return False
    # Check shape first: allclose would otherwise raise (or broadcast) on
    # arrays holding different numbers of atoms.
    if numpy.shape(arr0) != numpy.shape(arr1):
        return False
    return bool(
        numpy.allclose(
            arr0,
            arr1,
            rtol=1e-03 if rtol is None else rtol,
            atol=1e-03 if atol is None else atol,
        )
    )


def _count_unpaired_residue(r0, r1, verbose: bool, cmpdict: Dict[str, Any]) -> None:
    """Count a residue that has no positional partner in the other entity."""
    # Only rCount grows, so rMatchCount falls behind and the comparison fails.
    cmpdict["rCount"] += 1
    if verbose:
        if r0 is None:
            print("None !=", r1.get_full_id())
        else:
            print(r0.get_full_id(), "!= None")
def write_PDB(
    entity: Structure, file: str, pdbid: str = None, chainid: str = None
) -> None:
    """Write PDB file with HEADER and TITLE if available.

    Atoms are numbered with :func:`.enumerate_atoms` and the coordinate
    records are written by :class:`.PDBIO` with that numbering preserved.  If
    ``entity`` has a ``header`` dict, a HEADER record is written when it has a
    "head" entry and a TITLE record when it has a "name" entry.  Both are laid
    out on the fixed PDB columns (text starting in column 11, ID code in
    columns 63-66).  Titles too long for one record are not split into
    continuation lines.

    :param Entity entity: Biopython PDB Entity to write
    :param str file: file name or handle, opened through ``as_handle``
    :param str pdbid: default None.
        ID code for the HEADER record; falls back to ``header["idcode"]``
    :param str chainid: default None.
        Accepted but not used by this function
    :raises Exception: if a KeyError is raised while writing
    """
    enumerate_atoms(entity)
    with as_handle(file, "w") as fp:
        try:
            if hasattr(entity, "header"):
                if not pdbid:
                    pdbid = entity.header.get("idcode", None)
                hdr = entity.header.get("head", None)
                # presumably a PDB style DD-MMM-YY date or None; verify
                # against pdb_date
                dd = pdb_date(entity.header.get("deposition_date", None))
                if hdr:
                    # Columns: 1-6 record name, 11-50 classification,
                    # 51-59 deposition date, 63-66 ID code.
                    fp.write(
                        ("HEADER    {:40}{:9}   {:4}\n").format(
                            hdr.upper(), (dd or ""), (pdbid or "")
                        )
                    )
                nam = entity.header.get("name", None)
                if nam:
                    # Columns: 1-6 record name, 11-80 title text.
                    fp.write("TITLE     " + nam.upper() + "\n")
            io = PDBIO()
            io.set_structure(entity)
            io.save(fp, preserve_atom_numbering=True)
        except KeyError as err:
            raise Exception(
                "write_PDB: argument is not a Biopython PDB Entity " + str(entity)
            ) from err