Spaces:
No application file
No application file
# Copyright (C) 2002, Thomas Hamelryck ([email protected]) | |
# This code is part of the Biopython distribution and governed by its | |
# license. Please see the LICENSE file that should have been included | |
# as part of this package. | |
"""Parser for PDB files.""" | |
import warnings | |
try: | |
import numpy | |
except ImportError: | |
from Bio import MissingPythonDependencyError | |
raise MissingPythonDependencyError( | |
"Install NumPy if you want to use the PDB parser." | |
) from None | |
from Bio.File import as_handle | |
from Bio.PDB.PDBExceptions import PDBConstructionException | |
from Bio.PDB.PDBExceptions import PDBConstructionWarning | |
from Bio.PDB.StructureBuilder import StructureBuilder | |
from Bio.PDB.parse_pdb_header import _parse_pdb_header_list | |
# If PDB spec says "COLUMNS 18-20" this means line[17:20] | |
class PDBParser:
    """Parse a PDB file and return a Structure object.

    Parsing is column based: when the PDB specification says
    "COLUMNS 18-20" the corresponding Python slice is ``line[17:20]``.
    """

    def __init__(
        self,
        PERMISSIVE=True,
        get_header=False,
        structure_builder=None,
        QUIET=False,
        is_pqr=False,
    ):
        """Create a PDBParser object.

        The PDB parser calls a number of standard methods in an aggregated
        StructureBuilder object. Normally this object is instantiated by the
        PDBParser object itself, but if the user provides their own
        StructureBuilder object, the latter is used instead.

        Arguments:
         - PERMISSIVE - Evaluated as a Boolean. If false, exceptions in
           constructing the SMCRA data structure are fatal. If true (DEFAULT),
           the exceptions are caught, but some residues or atoms will be missing.
           THESE EXCEPTIONS ARE DUE TO PROBLEMS IN THE PDB FILE!.
         - get_header - unused argument kept for historical compatibility.
         - structure_builder - an optional user implemented StructureBuilder class.
         - QUIET - Evaluated as a Boolean. If true, warnings issued in constructing
           the SMCRA data will be suppressed. If false (DEFAULT), they will be shown.
           These warnings might be indicative of problems in the PDB file!
         - is_pqr - Evaluated as a Boolean. Specifies the type of file to be parsed.
           If false (DEFAULT) a .pdb file format is assumed. Set it to true if you
           want to parse a .pqr file instead.

        """
        # get_header is not used but is left in for API compatibility
        if structure_builder is not None:
            self.structure_builder = structure_builder
        else:
            self.structure_builder = StructureBuilder()
        self.header = None
        self.trailer = None
        # Number of lines consumed so far (0-based index of the next line).
        self.line_counter = 0
        self.PERMISSIVE = bool(PERMISSIVE)
        self.QUIET = bool(QUIET)
        self.is_pqr = bool(is_pqr)

    # Public methods

    def get_structure(self, id, file):
        """Return the structure.

        Arguments:
         - id - string, the id that will be used for the structure
         - file - name of the PDB file OR an open filehandle

        Raises ValueError if the file contains no lines.
        """
        with warnings.catch_warnings():
            if self.QUIET:
                warnings.filterwarnings("ignore", category=PDBConstructionWarning)

            # Forget anything left over from a previous call.
            self.header = None
            self.trailer = None
            # Make a StructureBuilder instance (pass id of structure as parameter)
            self.structure_builder.init_structure(id)

            with as_handle(file) as handle:
                lines = handle.readlines()
                if not lines:
                    raise ValueError("Empty file.")
                self._parse(lines)

            self.structure_builder.set_header(self.header)
            # Return the Structure instance
            structure = self.structure_builder.get_structure()

        return structure

    def get_header(self):
        """Return the header."""
        return self.header

    def get_trailer(self):
        """Return the trailer."""
        return self.trailer

    # Private methods

    def _parse(self, header_coords_trailer):
        """Parse the PDB file (PRIVATE).

        Stores the parsed header in ``self.header`` and the lines following
        the coordinate section in ``self.trailer``.
        """
        # Extract the header; return the rest of the file
        self.header, coords_trailer = self._get_header(header_coords_trailer)
        # Parse the atomic data; return the PDB file trailer
        self.trailer = self._parse_coordinates(coords_trailer)

    def _get_header(self, header_coords_trailer):
        """Get the header of the PDB file, return the rest (PRIVATE).

        Returns a tuple ``(header_dict, coords_trailer)`` where
        ``coords_trailer`` starts at the first ATOM/HETATM/MODEL record.
        """
        structure_builder = self.structure_builder
        # Index of the first coordinate line; stays 0 for an empty list.
        split_at = 0
        for split_at, line in enumerate(header_coords_trailer):
            structure_builder.set_line_counter(split_at + 1)
            # Record names are padded to six columns in the PDB format.
            if line[0:6] in ("ATOM  ", "HETATM", "MODEL "):
                break
        # NOTE: if no coordinate record is found the loop ends on the last
        # index, so the final line is handed to the coordinate parser rather
        # than to the header parser.
        header = header_coords_trailer[0:split_at]
        # Return the rest of the coords+trailer for further processing
        self.line_counter = split_at
        coords_trailer = header_coords_trailer[split_at:]
        header_dict = _parse_pdb_header_list(header)
        return header_dict, coords_trailer

    def _parse_coordinates(self, coords_trailer):
        """Parse the atomic data in the PDB file (PRIVATE).

        Feeds every coordinate-section record to the structure builder and
        returns the remaining lines (starting at the END or CONECT record),
        or an empty list if the end of the input is reached first.

        Raises PDBConstructionException when atomic coordinates cannot be
        parsed, and (via ``_handle_PDB_exception``) for other construction
        problems when the parser is not PERMISSIVE.
        """
        # Record names are always six characters wide (space padded).
        allowed_records = {
            "ATOM  ",
            "HETATM",
            "MODEL ",
            "ENDMDL",
            "TER   ",
            "ANISOU",
            # These are older 2.3 format specs:
            "SIGATM",
            "SIGUIJ",
            # bookkeeping records after coordinates:
            "MASTER",
        }
        # ANISOU and SIGUIJ share the same layout: six 7-column integer
        # fields occupying columns 29-70.
        uij_columns = ((28, 35), (35, 42), (42, 49), (49, 56), (56, 63), (63, 70))
        # SIGATM mirrors the x, y, z, occupancy and B factor columns of ATOM.
        sigatm_columns = ((30, 38), (38, 46), (46, 54), (54, 60), (60, 66))

        structure_builder = self.structure_builder
        # 1-based file line number of coords_trailer[0].
        first_line_number = self.line_counter + 1

        current_model_id = 0
        # Flag we have an open model
        model_open = False
        current_chain_id = None
        current_segid = None
        current_residue_id = None
        current_resname = None

        for offset, raw_line in enumerate(coords_trailer):
            line = raw_line.rstrip("\n")
            record_type = line[0:6]
            # Derived from the loop index so that skipped (blank) lines are
            # still counted.
            global_line_counter = first_line_number + offset
            structure_builder.set_line_counter(global_line_counter)

            if not line.strip():
                continue  # skip empty lines

            if record_type in ("ATOM  ", "HETATM"):
                # Initialize the Model - there was no explicit MODEL record
                if not model_open:
                    structure_builder.init_model(current_model_id)
                    current_model_id += 1
                    model_open = True
                fullname = line[12:16]
                # get rid of whitespace in atom names
                split_list = fullname.split()
                if len(split_list) != 1:
                    # atom name has internal spaces, e.g. " N B ", so
                    # we do not strip spaces
                    name = fullname
                else:
                    # atom name is like " CA ", so we can strip spaces
                    name = split_list[0]
                altloc = line[16]
                resname = line[17:20].strip()
                chainid = line[21]
                try:
                    serial_number = int(line[6:11])
                except ValueError:
                    serial_number = 0
                resseq = int(line[22:26].split()[0])  # sequence identifier
                icode = line[26]  # insertion code
                if record_type == "HETATM":  # hetero atom flag
                    hetero_flag = "W" if resname in ("HOH", "WAT") else "H"
                else:
                    hetero_flag = " "
                residue_id = (hetero_flag, resseq, icode)
                # atomic coordinates
                try:
                    x = float(line[30:38])
                    y = float(line[38:46])
                    z = float(line[46:54])
                except ValueError:
                    # Should we allow parsing to continue in permissive mode?
                    # If so, what coordinates should we default to? Easier to abort!
                    raise PDBConstructionException(
                        "Invalid or missing coordinate(s) at line %i."
                        % global_line_counter
                    ) from None
                coord = numpy.array((x, y, z), "f")

                if not self.is_pqr:
                    # occupancy & B factor
                    try:
                        occupancy = float(line[54:60])
                    except ValueError:
                        self._handle_PDB_exception(
                            "Invalid or missing occupancy", global_line_counter
                        )
                        occupancy = None  # Rather than arbitrary zero or one
                    if occupancy is not None and occupancy < 0:
                        # TODO - Should this be an error in strict mode?
                        # This uses fixed text so the warning occurs once only:
                        warnings.warn(
                            "Negative occupancy in one or more atoms",
                            PDBConstructionWarning,
                        )
                    try:
                        bfactor = float(line[60:66])
                    except ValueError:
                        self._handle_PDB_exception(
                            "Invalid or missing B factor", global_line_counter
                        )
                        bfactor = 0.0  # PDB uses a default of zero if missing
                else:
                    # Attempt to parse charge and radius fields
                    try:
                        pqr_charge = float(line[54:62])
                    except ValueError:
                        self._handle_PDB_exception(
                            "Invalid or missing charge", global_line_counter
                        )
                        pqr_charge = None  # Rather than arbitrary zero or one
                    try:
                        radius = float(line[62:70])
                    except ValueError:
                        self._handle_PDB_exception(
                            "Invalid or missing radius", global_line_counter
                        )
                        radius = None
                    if radius is not None and radius < 0:
                        # Warn in permissive mode, raise in strict mode.
                        message = "Negative atom radius"
                        self._handle_PDB_exception(message, global_line_counter)
                        radius = None

                segid = line[72:76]
                element = line[76:78].strip().upper()

                if current_segid != segid:
                    current_segid = segid
                    structure_builder.init_seg(current_segid)

                new_chain = current_chain_id != chainid
                if new_chain:
                    current_chain_id = chainid
                    structure_builder.init_chain(current_chain_id)
                # A new chain always starts a new residue; otherwise a change
                # in residue id or residue name does.
                if (
                    new_chain
                    or current_residue_id != residue_id
                    or current_resname != resname
                ):
                    current_residue_id = residue_id
                    current_resname = resname
                    try:
                        structure_builder.init_residue(
                            resname, hetero_flag, resseq, icode
                        )
                    except PDBConstructionException as message:
                        self._handle_PDB_exception(message, global_line_counter)

                try:
                    if not self.is_pqr:
                        # init atom with pdb fields
                        structure_builder.init_atom(
                            name,
                            coord,
                            bfactor,
                            occupancy,
                            altloc,
                            fullname,
                            serial_number,
                            element,
                        )
                    else:
                        # init atom with pqr fields; charge and radius also
                        # fill the B factor / occupancy positions
                        structure_builder.init_atom(
                            name,
                            coord,
                            pqr_charge,
                            radius,
                            altloc,
                            fullname,
                            serial_number,
                            element,
                            pqr_charge,
                            radius,
                            self.is_pqr,
                        )
                except PDBConstructionException as message:
                    self._handle_PDB_exception(message, global_line_counter)
            elif record_type == "ANISOU":
                anisou = [float(line[start:end]) for start, end in uij_columns]
                # U's are scaled by 10^4
                anisou_array = (numpy.array(anisou, "f") / 10000.0).astype("f")
                structure_builder.set_anisou(anisou_array)
            elif record_type == "MODEL ":
                try:
                    serial_num = int(line[10:14])
                except ValueError:
                    self._handle_PDB_exception(
                        "Invalid or missing model serial number", global_line_counter
                    )
                    serial_num = 0
                structure_builder.init_model(current_model_id, serial_num)
                current_model_id += 1
                model_open = True
                current_chain_id = None
                current_residue_id = None
            elif record_type in ("END   ", "CONECT"):
                # End of atomic data, return the trailer
                self.line_counter += offset
                return coords_trailer[offset:]
            elif record_type == "ENDMDL":
                model_open = False
                current_chain_id = None
                current_residue_id = None
            elif record_type == "SIGUIJ":
                # standard deviation of anisotropic B factor
                siguij = [float(line[start:end]) for start, end in uij_columns]
                # U sigma's are scaled by 10^4
                siguij_array = (numpy.array(siguij, "f") / 10000.0).astype("f")
                structure_builder.set_siguij(siguij_array)
            elif record_type == "SIGATM":
                # standard deviation of atomic positions
                sigatm = [float(line[start:end]) for start, end in sigatm_columns]
                sigatm_array = numpy.array(sigatm, "f")
                structure_builder.set_sigatm(sigatm_array)
            elif record_type not in allowed_records:
                warnings.warn(
                    "Ignoring unrecognized record '{}' at line {}".format(
                        record_type, global_line_counter
                    ),
                    PDBConstructionWarning,
                )

        # EOF (does not end in END or CONECT)
        self.line_counter += len(coords_trailer)
        return []

    def _handle_PDB_exception(self, message, line_counter):
        """Handle exception (PRIVATE).

        This method catches an exception that occurs in the StructureBuilder
        object (if PERMISSIVE), or raises it again, this time adding the
        PDB line number to the error message.

        Arguments:
         - message - the exception or message text to report
         - line_counter - 1-based line number the problem was found at
        """
        message = "%s at line %i." % (message, line_counter)
        if self.PERMISSIVE:
            # just print a warning - some residues/atoms may be missing
            warnings.warn(
                "PDBConstructionException: %s\n"
                "Exception ignored.\n"
                "Some atoms or residues may be missing in the data structure."
                % message,
                PDBConstructionWarning,
            )
        else:
            # exceptions are fatal - raise again with new message (including line nr)
            raise PDBConstructionException(message) from None