Spaces:
No application file
No application file
# Copyright (C) 2002, Thomas Hamelryck ([email protected]) | |
# This code is part of the Biopython distribution and governed by its | |
# license. Please see the LICENSE file that should have been included | |
# as part of this package. | |
"""mmCIF parsers.""" | |
import numpy | |
import warnings | |
from Bio.File import as_handle | |
from Bio.PDB.MMCIF2Dict import MMCIF2Dict | |
from Bio.PDB.StructureBuilder import StructureBuilder | |
from Bio.PDB.PDBExceptions import PDBConstructionException | |
from Bio.PDB.PDBExceptions import PDBConstructionWarning | |
class MMCIFParser: | |
"""Parse a mmCIF file and return a Structure object.""" | |
def __init__( | |
self, structure_builder=None, auth_chains=True, auth_residues=True, QUIET=False | |
): | |
"""Create a PDBParser object. | |
The mmCIF parser calls a number of standard methods in an aggregated | |
StructureBuilder object. Normally this object is instantiated by the | |
MMCIParser object itself, but if the user provides his/her own | |
StructureBuilder object, the latter is used instead. | |
Arguments: | |
- structure_builder - an optional user implemented StructureBuilder class. | |
- auth_chains - True by default. If true, use the author chain IDs. | |
If false, use the re-assigned mmCIF chain IDs. | |
- auth_residues - True by default. If true, use the author residue numbering. | |
If false, use the mmCIF "label" residue numbering, which has no insertion | |
codes, and strictly increments residue numbers. | |
NOTE: Non-polymers such as water don't have a "label" residue number, | |
and will be skipped. | |
- QUIET - Evaluated as a Boolean. If true, warnings issued in constructing | |
the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. | |
These warnings might be indicative of problems in the mmCIF file! | |
""" | |
if structure_builder is not None: | |
self._structure_builder = structure_builder | |
else: | |
self._structure_builder = StructureBuilder() | |
self.header = None | |
# self.trailer = None | |
self.line_counter = 0 | |
self.build_structure = None | |
self.auth_chains = bool(auth_chains) | |
self.auth_residues = bool(auth_residues) | |
self.QUIET = bool(QUIET) | |
# Public methods | |
def get_structure(self, structure_id, filename): | |
"""Return the structure. | |
Arguments: | |
- structure_id - string, the id that will be used for the structure | |
- filename - name of mmCIF file, OR an open text mode file handle | |
""" | |
with warnings.catch_warnings(): | |
if self.QUIET: | |
warnings.filterwarnings("ignore", category=PDBConstructionWarning) | |
self._mmcif_dict = MMCIF2Dict(filename) | |
self._build_structure(structure_id) | |
self._structure_builder.set_header(self._get_header()) | |
return self._structure_builder.get_structure() | |
# Private methods | |
def _mmcif_get(self, key, dict, deflt): | |
if key in dict: | |
rslt = dict[key][0] | |
if "?" != rslt: | |
return rslt | |
return deflt | |
def _update_header_entry(self, target_key, keys): | |
md = self._mmcif_dict | |
for key in keys: | |
val = md.get(key) | |
try: | |
item = val[0] | |
except (TypeError, IndexError): | |
continue | |
if item != "?": | |
self.header[target_key] = item | |
break | |
def _get_header(self): | |
self.header = { | |
"name": "", | |
"head": "", | |
"idcode": "", | |
"deposition_date": "", | |
"structure_method": "", | |
"resolution": None, | |
} | |
self._update_header_entry( | |
"idcode", ["_entry_id", "_exptl.entry_id", "_struct.entry_id"] | |
) | |
self._update_header_entry("name", ["_struct.title"]) | |
self._update_header_entry( | |
"head", ["_struct_keywords.pdbx_keywords", "_struct_keywords.text"] | |
) | |
self._update_header_entry( | |
"deposition_date", ["_pdbx_database_status.recvd_initial_deposition_date"] | |
) | |
self._update_header_entry("structure_method", ["_exptl.method"]) | |
self._update_header_entry( | |
"resolution", | |
[ | |
"_refine.ls_d_res_high", | |
"_refine_hist.d_res_high", | |
"_em_3d_reconstruction.resolution", | |
], | |
) | |
if self.header["resolution"] is not None: | |
try: | |
self.header["resolution"] = float(self.header["resolution"]) | |
except ValueError: | |
self.header["resolution"] = None | |
return self.header | |
def _build_structure(self, structure_id): | |
# two special chars as placeholders in the mmCIF format | |
# for item values that cannot be explicitly assigned | |
# see: pdbx/mmcif syntax web page | |
_unassigned = {".", "?"} | |
mmcif_dict = self._mmcif_dict | |
atom_serial_list = mmcif_dict["_atom_site.id"] | |
atom_id_list = mmcif_dict["_atom_site.label_atom_id"] | |
residue_id_list = mmcif_dict["_atom_site.label_comp_id"] | |
try: | |
element_list = mmcif_dict["_atom_site.type_symbol"] | |
except KeyError: | |
element_list = None | |
if self.auth_chains: | |
chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] | |
else: | |
chain_id_list = mmcif_dict["_atom_site.label_asym_id"] | |
x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] | |
y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] | |
z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] | |
alt_list = mmcif_dict["_atom_site.label_alt_id"] | |
icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] | |
b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] | |
occupancy_list = mmcif_dict["_atom_site.occupancy"] | |
fieldname_list = mmcif_dict["_atom_site.group_PDB"] | |
try: | |
serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] | |
except KeyError: | |
# No model number column | |
serial_list = None | |
except ValueError: | |
# Invalid model number (malformed file) | |
raise PDBConstructionException("Invalid model number") from None | |
try: | |
aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"] | |
aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"] | |
aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"] | |
aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"] | |
aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"] | |
aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"] | |
aniso_flag = 1 | |
except KeyError: | |
# no anisotropic B factors | |
aniso_flag = 0 | |
if self.auth_residues: | |
# if auth_seq_id is present, we use this. | |
# Otherwise label_seq_id is used. | |
if "_atom_site.auth_seq_id" in mmcif_dict: | |
seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] | |
else: | |
seq_id_list = mmcif_dict["_atom_site.label_seq_id"] | |
else: | |
seq_id_list = mmcif_dict["_atom_site.label_seq_id"] | |
# Now loop over atoms and build the structure | |
current_chain_id = None | |
current_residue_id = None | |
current_resname = None | |
structure_builder = self._structure_builder | |
structure_builder.init_structure(structure_id) | |
structure_builder.init_seg(" ") | |
# Historically, Biopython PDB parser uses model_id to mean array index | |
# so serial_id means the Model ID specified in the file | |
current_model_id = -1 | |
current_serial_id = -1 | |
for i in range(0, len(atom_id_list)): | |
# set the line_counter for 'ATOM' lines only and not | |
# as a global line counter found in the PDBParser() | |
structure_builder.set_line_counter(i) | |
# Try coercing serial to int, for compatibility with PDBParser | |
# But do not quit if it fails. mmCIF format specs allow strings. | |
try: | |
serial = int(atom_serial_list[i]) | |
except ValueError: | |
serial = atom_serial_list[i] | |
warnings.warn( | |
"PDBConstructionWarning: " | |
"Some atom serial numbers are not numerical", | |
PDBConstructionWarning, | |
) | |
x = x_list[i] | |
y = y_list[i] | |
z = z_list[i] | |
resname = residue_id_list[i] | |
chainid = chain_id_list[i] | |
altloc = alt_list[i] | |
if altloc in _unassigned: | |
altloc = " " | |
resseq = seq_id_list[i] | |
if resseq == ".": | |
# Non-existing residue ID | |
try: | |
msg_resseq = mmcif_dict["_atom_site.auth_seq_id"][i] | |
msg = "Non-existing residue ID in chain '{}', residue '{}'".format( | |
chainid, msg_resseq | |
) | |
except (KeyError, IndexError): | |
msg = f"Non-existing residue ID in chain '{chainid}'" | |
warnings.warn( | |
"PDBConstructionWarning: ", | |
msg, | |
PDBConstructionWarning, | |
) | |
continue | |
int_resseq = int(resseq) | |
icode = icode_list[i] | |
if icode in _unassigned: | |
icode = " " | |
name = atom_id_list[i] | |
# occupancy & B factor | |
try: | |
tempfactor = float(b_factor_list[i]) | |
except ValueError: | |
raise PDBConstructionException("Invalid or missing B factor") from None | |
try: | |
occupancy = float(occupancy_list[i]) | |
except ValueError: | |
raise PDBConstructionException("Invalid or missing occupancy") from None | |
fieldname = fieldname_list[i] | |
if fieldname == "HETATM": | |
if resname == "HOH" or resname == "WAT": | |
hetatm_flag = "W" | |
else: | |
hetatm_flag = "H" | |
else: | |
hetatm_flag = " " | |
resseq = (hetatm_flag, int_resseq, icode) | |
if serial_list is not None: | |
# model column exists; use it | |
serial_id = serial_list[i] | |
if current_serial_id != serial_id: | |
# if serial changes, update it and start new model | |
current_serial_id = serial_id | |
current_model_id += 1 | |
structure_builder.init_model(current_model_id, current_serial_id) | |
current_chain_id = None | |
current_residue_id = None | |
current_resname = None | |
else: | |
# no explicit model column; initialize single model | |
structure_builder.init_model(current_model_id) | |
if current_chain_id != chainid: | |
current_chain_id = chainid | |
structure_builder.init_chain(current_chain_id) | |
current_residue_id = None | |
current_resname = None | |
if current_residue_id != resseq or current_resname != resname: | |
current_residue_id = resseq | |
current_resname = resname | |
structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) | |
coord = numpy.array((x, y, z), "f") | |
element = element_list[i].upper() if element_list else None | |
structure_builder.init_atom( | |
name, | |
coord, | |
tempfactor, | |
occupancy, | |
altloc, | |
name, | |
serial_number=serial, | |
element=element, | |
) | |
if aniso_flag == 1 and i < len(aniso_u11): | |
u = ( | |
aniso_u11[i], | |
aniso_u12[i], | |
aniso_u13[i], | |
aniso_u22[i], | |
aniso_u23[i], | |
aniso_u33[i], | |
) | |
mapped_anisou = [float(_) for _ in u] | |
anisou_array = numpy.array(mapped_anisou, "f") | |
structure_builder.set_anisou(anisou_array) | |
# Now try to set the cell | |
try: | |
a = float(mmcif_dict["_cell.length_a"][0]) | |
b = float(mmcif_dict["_cell.length_b"][0]) | |
c = float(mmcif_dict["_cell.length_c"][0]) | |
alpha = float(mmcif_dict["_cell.angle_alpha"][0]) | |
beta = float(mmcif_dict["_cell.angle_beta"][0]) | |
gamma = float(mmcif_dict["_cell.angle_gamma"][0]) | |
cell = numpy.array((a, b, c, alpha, beta, gamma), "f") | |
spacegroup = mmcif_dict["_symmetry.space_group_name_H-M"][0] | |
spacegroup = spacegroup[1:-1] # get rid of quotes!! | |
if spacegroup is None: | |
raise Exception | |
structure_builder.set_symmetry(spacegroup, cell) | |
except Exception: | |
pass # no cell found, so just ignore | |
class FastMMCIFParser: | |
"""Parse an MMCIF file and return a Structure object.""" | |
def __init__( | |
self, structure_builder=None, auth_chains=True, auth_residues=True, QUIET=False | |
): | |
"""Create a FastMMCIFParser object. | |
The mmCIF parser calls a number of standard methods in an aggregated | |
StructureBuilder object. Normally this object is instantiated by the | |
parser object itself, but if the user provides his/her own | |
StructureBuilder object, the latter is used instead. | |
The main difference between this class and the regular MMCIFParser is | |
that only 'ATOM' and 'HETATM' lines are parsed here. Use if you are | |
interested only in coordinate information. | |
Arguments: | |
- structure_builder - an optional user implemented StructureBuilder class. | |
- auth_chains - True by default. If true, use the author chain IDs. | |
If false, use the re-assigned mmCIF chain IDs. | |
- auth_residues - True by default. If true, use the author residue numbering. | |
If false, use the mmCIF "label" residue numbering, which has no insertion | |
codes, and strictly increments residue numbers. | |
NOTE: Non-polymers such as water don't have a "label" residue number, | |
and will be skipped. | |
- QUIET - Evaluated as a Boolean. If true, warnings issued in constructing | |
the SMCRA data will be suppressed. If false (DEFAULT), they will be shown. | |
These warnings might be indicative of problems in the mmCIF file! | |
""" | |
if structure_builder is not None: | |
self._structure_builder = structure_builder | |
else: | |
self._structure_builder = StructureBuilder() | |
self.line_counter = 0 | |
self.build_structure = None | |
self.auth_chains = bool(auth_chains) | |
self.auth_residues = bool(auth_residues) | |
self.QUIET = bool(QUIET) | |
# Public methods | |
def get_structure(self, structure_id, filename): | |
"""Return the structure. | |
Arguments: | |
- structure_id - string, the id that will be used for the structure | |
- filename - name of the mmCIF file OR an open filehandle | |
""" | |
with warnings.catch_warnings(): | |
if self.QUIET: | |
warnings.filterwarnings("ignore", category=PDBConstructionWarning) | |
with as_handle(filename) as handle: | |
self._build_structure(structure_id, handle) | |
return self._structure_builder.get_structure() | |
# Private methods | |
def _build_structure(self, structure_id, filehandle): | |
# two special chars as placeholders in the mmCIF format | |
# for item values that cannot be explicitly assigned | |
# see: pdbx/mmcif syntax web page | |
_unassigned = {".", "?"} | |
# Read only _atom_site. and atom_site_anisotrop entries | |
read_atom, read_aniso = False, False | |
_fields, _records = [], [] | |
_anisof, _anisors = [], [] | |
for line in filehandle: | |
if line.startswith("_atom_site."): | |
read_atom = True | |
_fields.append(line.strip()) | |
elif line.startswith("_atom_site_anisotrop."): | |
read_aniso = True | |
_anisof.append(line.strip()) | |
elif read_atom and line.startswith("#"): | |
read_atom = False | |
elif read_aniso and line.startswith("#"): | |
read_aniso = False | |
elif read_atom: | |
_records.append(line.strip()) | |
elif read_aniso: | |
_anisors.append(line.strip()) | |
# Dumping the shlex module here since this particular | |
# category should be rather straightforward. | |
# Quite a performance boost.. | |
_record_tbl = zip(*map(str.split, _records)) | |
_anisob_tbl = zip(*map(str.split, _anisors)) | |
mmcif_dict = dict(zip(_fields, _record_tbl)) | |
mmcif_dict.update(dict(zip(_anisof, _anisob_tbl))) | |
# Build structure object | |
atom_serial_list = mmcif_dict["_atom_site.id"] | |
atom_id_list = mmcif_dict["_atom_site.label_atom_id"] | |
residue_id_list = mmcif_dict["_atom_site.label_comp_id"] | |
try: | |
element_list = mmcif_dict["_atom_site.type_symbol"] | |
except KeyError: | |
element_list = None | |
if self.auth_chains: | |
chain_id_list = mmcif_dict["_atom_site.auth_asym_id"] | |
else: | |
chain_id_list = mmcif_dict["_atom_site.label_asym_id"] | |
x_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_x"]] | |
y_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_y"]] | |
z_list = [float(x) for x in mmcif_dict["_atom_site.Cartn_z"]] | |
alt_list = mmcif_dict["_atom_site.label_alt_id"] | |
icode_list = mmcif_dict["_atom_site.pdbx_PDB_ins_code"] | |
b_factor_list = mmcif_dict["_atom_site.B_iso_or_equiv"] | |
occupancy_list = mmcif_dict["_atom_site.occupancy"] | |
fieldname_list = mmcif_dict["_atom_site.group_PDB"] | |
try: | |
serial_list = [int(n) for n in mmcif_dict["_atom_site.pdbx_PDB_model_num"]] | |
except KeyError: | |
# No model number column | |
serial_list = None | |
except ValueError: | |
# Invalid model number (malformed file) | |
raise PDBConstructionException("Invalid model number") from None | |
try: | |
aniso_u11 = mmcif_dict["_atom_site_anisotrop.U[1][1]"] | |
aniso_u12 = mmcif_dict["_atom_site_anisotrop.U[1][2]"] | |
aniso_u13 = mmcif_dict["_atom_site_anisotrop.U[1][3]"] | |
aniso_u22 = mmcif_dict["_atom_site_anisotrop.U[2][2]"] | |
aniso_u23 = mmcif_dict["_atom_site_anisotrop.U[2][3]"] | |
aniso_u33 = mmcif_dict["_atom_site_anisotrop.U[3][3]"] | |
aniso_flag = 1 | |
except KeyError: | |
# no anisotropic B factors | |
aniso_flag = 0 | |
if self.auth_residues: | |
# if auth_seq_id is present, we use this. | |
# Otherwise label_seq_id is used. | |
if "_atom_site.auth_seq_id" in mmcif_dict: | |
seq_id_list = mmcif_dict["_atom_site.auth_seq_id"] | |
else: | |
seq_id_list = mmcif_dict["_atom_site.label_seq_id"] | |
else: | |
seq_id_list = mmcif_dict["_atom_site.label_seq_id"] | |
# Now loop over atoms and build the structure | |
current_chain_id = None | |
current_residue_id = None | |
current_resname = None | |
structure_builder = self._structure_builder | |
structure_builder.init_structure(structure_id) | |
structure_builder.init_seg(" ") | |
# Historically, Biopython PDB parser uses model_id to mean array index | |
# so serial_id means the Model ID specified in the file | |
current_model_id = -1 | |
current_serial_id = -1 | |
for i in range(0, len(atom_id_list)): | |
# set the line_counter for 'ATOM' lines only and not | |
# as a global line counter found in the PDBParser() | |
structure_builder.set_line_counter(i) | |
serial = atom_serial_list[i] | |
x = x_list[i] | |
y = y_list[i] | |
z = z_list[i] | |
resname = residue_id_list[i] | |
chainid = chain_id_list[i] | |
altloc = alt_list[i] | |
if altloc in _unassigned: | |
altloc = " " | |
resseq = seq_id_list[i] | |
if resseq == ".": | |
# Non-existing residue ID | |
try: | |
msg_resseq = mmcif_dict["_atom_site.auth_seq_id"][i] | |
msg = "Non-existing residue ID in chain '{}', residue '{}'".format( | |
chainid, msg_resseq | |
) | |
except (KeyError, IndexError): | |
msg = f"Non-existing residue ID in chain '{chainid}'" | |
warnings.warn( | |
"PDBConstructionWarning: ", | |
msg, | |
PDBConstructionWarning, | |
) | |
continue | |
int_resseq = int(resseq) | |
icode = icode_list[i] | |
if icode in _unassigned: | |
icode = " " | |
# Remove occasional " from quoted atom names (e.g. xNA) | |
name = atom_id_list[i].strip('"') | |
# occupancy & B factor | |
try: | |
tempfactor = float(b_factor_list[i]) | |
except ValueError: | |
raise PDBConstructionException("Invalid or missing B factor") from None | |
try: | |
occupancy = float(occupancy_list[i]) | |
except ValueError: | |
raise PDBConstructionException("Invalid or missing occupancy") from None | |
fieldname = fieldname_list[i] | |
if fieldname == "HETATM": | |
hetatm_flag = "H" | |
else: | |
hetatm_flag = " " | |
resseq = (hetatm_flag, int_resseq, icode) | |
if serial_list is not None: | |
# model column exists; use it | |
serial_id = serial_list[i] | |
if current_serial_id != serial_id: | |
# if serial changes, update it and start new model | |
current_serial_id = serial_id | |
current_model_id += 1 | |
structure_builder.init_model(current_model_id, current_serial_id) | |
current_chain_id = None | |
current_residue_id = None | |
current_resname = None | |
else: | |
# no explicit model column; initialize single model | |
structure_builder.init_model(current_model_id) | |
if current_chain_id != chainid: | |
current_chain_id = chainid | |
structure_builder.init_chain(current_chain_id) | |
current_residue_id = None | |
current_resname = None | |
if current_residue_id != resseq or current_resname != resname: | |
current_residue_id = resseq | |
current_resname = resname | |
structure_builder.init_residue(resname, hetatm_flag, int_resseq, icode) | |
coord = numpy.array((x, y, z), "f") | |
element = element_list[i] if element_list else None | |
structure_builder.init_atom( | |
name, | |
coord, | |
tempfactor, | |
occupancy, | |
altloc, | |
name, | |
serial_number=serial, | |
element=element, | |
) | |
if aniso_flag == 1 and i < len(aniso_u11): | |
u = ( | |
aniso_u11[i], | |
aniso_u12[i], | |
aniso_u13[i], | |
aniso_u22[i], | |
aniso_u23[i], | |
aniso_u33[i], | |
) | |
mapped_anisou = [float(_) for _ in u] | |
anisou_array = numpy.array(mapped_anisou, "f") | |
structure_builder.set_anisou(anisou_array) | |