Spaces:
No application file
No application file
File size: 15,410 Bytes
b7731cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 |
# Copyright 2017 Joe Greener. All rights reserved.
#
# This file is part of the Biopython distribution and governed by your
# choice of the "Biopython License Agreement" or the "BSD 3-Clause License".
# Please see the LICENSE file that should have been included as part of this
# package.
"""Write an mmCIF file.
See https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for syntax.
"""
import re
from collections import defaultdict
from Bio.PDB.StructureBuilder import StructureBuilder
from Bio.PDB.PDBIO import Select, StructureIO
# Preferred item order for categories whose columns must be written in a fixed
# sequence. Items of such a category that are not listed here are written
# after the listed ones.
mmcif_order = {
    "_atom_site": [
        # Record type, atom serial number and element
        "group_PDB", "id", "type_symbol",
        # label_* identifiers for atom, alternate location, residue and chain
        "label_atom_id", "label_alt_id", "label_comp_id",
        "label_asym_id", "label_entity_id", "label_seq_id",
        "pdbx_PDB_ins_code",
        # Cartesian coordinates
        "Cartn_x", "Cartn_y", "Cartn_z",
        # Per-atom properties
        "occupancy", "B_iso_or_equiv", "pdbx_formal_charge",
        # auth_* identifiers
        "auth_seq_id", "auth_comp_id", "auth_asym_id", "auth_atom_id",
        # Model the atom belongs to
        "pdbx_PDB_model_num",
    ]
}
# Module-level selector instance used as the default ``select`` argument of
# MMCIFIO.save, so one shared object serves every call that omits ``select``.
# NOTE(review): presumably a plain Select() accepts every model, chain,
# residue and atom - confirm against Bio.PDB.PDBIO.Select.
_select = Select()
class MMCIFIO(StructureIO):
    """Write a Structure object or a mmCIF dictionary as a mmCIF file.

    Examples
    --------
    >>> from Bio.PDB import MMCIFParser
    >>> from Bio.PDB.mmcifio import MMCIFIO
    >>> parser = MMCIFParser()
    >>> structure = parser.get_structure("1a8o", "PDB/1A8O.cif")
    >>> io=MMCIFIO()
    >>> io.set_structure(structure)
    >>> io.save("bio-pdb-mmcifio-out.cif")
    >>> import os
    >>> os.remove("bio-pdb-mmcifio-out.cif") # tidy up

    """

    def __init__(self):
        """Initialise."""
        pass

    def set_dict(self, dic):
        """Set the mmCIF dictionary to be written out.

        Any structure set previously is discarded, so the next call to
        ``save`` writes this dictionary.

        :param dic: mapping whose optional ``"data_"`` entry holds the data
            block name and whose other keys have the form
            ``"_category.item"``, each mapped to a string or a list of strings
        :type dic: dict
        """
        self.dic = dic
        # Remove self.structure if it has been set
        if hasattr(self, "structure"):
            delattr(self, "structure")

    def save(self, filepath, select=_select, preserve_atom_numbering=False):
        """Save the structure or the mmCIF dictionary to a file.

        A structure takes precedence over a dictionary when both are set.

        :param filepath: output file
        :type filepath: string or filehandle

        :param select: selects which entities will be written; only used when
            a structure is written.
        :type select: object

        Typically select is a subclass of L{Select}, it should
        have the following methods:

         - accept_model(model)
         - accept_chain(chain)
         - accept_residue(residue)
         - accept_atom(atom)

        These methods should return 1 if the entity is to be
        written out, 0 otherwise.

        :param preserve_atom_numbering: if True, write each atom's own serial
            number; otherwise atoms are renumbered from 1 within each model.
        :type preserve_atom_numbering: bool

        :raises ValueError: if neither a structure nor a dictionary was set.
        """
        # Validate before touching the file system so that a failed call does
        # not create or truncate the output file
        write_structure = hasattr(self, "structure")
        if not write_structure and not hasattr(self, "dic"):
            raise ValueError(
                "Use set_structure or set_dict to set a structure or dictionary to write out"
            )

        # Similar to the PDBIO save method, we check if the filepath is a
        # string for a filepath or an open file handle
        if isinstance(filepath, str):
            fp = open(filepath, "w")
            close_file = True
        else:
            fp = filepath
            close_file = False

        try:
            if write_structure:
                self._save_structure(fp, select, preserve_atom_numbering)
            else:
                self._save_dict(fp)
        finally:
            # Only close handles opened here, and do so even when writing fails
            if close_file:
                fp.close()

    def _save_dict(self, out_file):
        """Write ``self.dic`` to ``out_file`` in mmCIF syntax.

        Categories holding single values are written as key-value pairs;
        categories holding several values are written as ``loop_`` tables.

        :param out_file: open handle with a ``write`` method
        :raises ValueError: on a key that is not ``"data_"`` or of the form
            ``"_category.item"``, on a value that is neither a string nor a
            list, or when the items of one category disagree in length.
        """
        # Group the item names by category, e.g. "_atom_site.id" is recorded
        # as key_lists["_atom_site"] == ["id", ...]
        data_val = None  # stays None when the dictionary has no "data_" key
        key_lists = {}
        for key in self.dic:
            if key == "data_":
                data_val = self.dic[key]
                continue
            parts = key.split(".")
            if len(parts) != 2:
                raise ValueError("Invalid key in mmCIF dictionary: " + key)
            key_lists.setdefault(parts[0], []).append(parts[1])

        # Re-order lists if an order has been specified
        # Not all elements from the specified order are necessarily present
        for category, items in key_lists.items():
            if category not in mmcif_order:
                continue
            preferred = mmcif_order[category]
            rank = {}
            for position, name in enumerate(preferred):
                rank.setdefault(name, position)
            # Unrecognised items share the last rank and so end up after the
            # known ones, sorted by name
            unknown_rank = len(preferred)
            key_lists[category] = sorted(
                items, key=lambda name: (rank.get(name, unknown_rank), name)
            )

        # Write out top data_ line
        if data_val:
            out_file.write("data_" + data_val + "\n#\n")

        for category, items in key_lists.items():
            # Pick a sample mmCIF value, which can be a list or a single value
            sample_val = self.dic[category + "." + items[0]]
            if not isinstance(sample_val, (str, list)):
                raise ValueError(
                    "Invalid type in mmCIF dictionary: " + str(type(sample_val))
                )
            sample_is_list = isinstance(sample_val, list)
            n_vals = len(sample_val)

            # Check the mmCIF dictionary has consistent list sizes
            for item in items:
                val = self.dic[category + "." + item]
                if sample_is_list:
                    inconsistent = isinstance(val, str) or len(val) != n_vals
                else:
                    inconsistent = isinstance(val, list)
                if inconsistent:
                    raise ValueError(
                        "Inconsistent list sizes in mmCIF dictionary: "
                        + category
                        + "."
                        + item
                    )

            if not sample_is_list or n_vals == 1:
                # Single values are written as aligned key-value pairs
                key_width = len(category) + max(len(item) for item in items) + 4
                for item in items:
                    value = self.dic[category + "." + item]
                    # If the value is a single item list, just take the value
                    if sample_is_list:
                        value = value[0]
                    out_file.write(
                        (category + "." + item).ljust(key_width)
                        + self._format_mmcif_col(value, len(value))
                        + "\n"
                    )
            else:
                # Several values are written as keys followed by a value table
                out_file.write("loop_\n")
                columns = []
                for item in items:
                    out_file.write(category + "." + item + "\n")
                    values = self.dic[category + "." + item]
                    widest = 0
                    for val in values:
                        len_val = len(val)
                        # If the value requires quoting it will add 2 characters
                        if self._requires_quote(val) and not self._requires_newline(
                            val
                        ):
                            len_val += 2
                        widest = max(widest, len_val)
                    # One extra character separates neighbouring columns
                    columns.append((values, widest + 1))
                # Technically the max of the sum of the column widths is 2048
                for row in range(n_vals):
                    cells = [
                        self._format_mmcif_col(values[row], width)
                        for values, width in columns
                    ]
                    out_file.write("".join(cells) + "\n")
            out_file.write("#\n")

    def _format_mmcif_col(self, val, col_width):
        """Return ``val`` as a mmCIF data value padded to ``col_width``.

        The value is enclosed in quotes or in a semicolon-delimited text field
        where the syntax requires it. See
        https://www.iucr.org/resources/cif/spec/version1.1/cifsyntax for
        syntax.

        :param val: the value to write
        :type val: string
        :param col_width: minimum width of the returned text; shorter values
            are padded on the right with spaces
        :type col_width: int
        """
        # If there is a newline or quotes cannot be contained, use semicolon
        # and newline construct
        if self._requires_newline(val):
            return "\n;" + val + "\n;\n"
        if self._requires_quote(val):
            # A quote followed by whitespace would end the value early, so
            # pick the quote character for which that cannot happen
            if self._quote_before_whitespace(val, "'"):
                quote = '"'
            else:
                quote = "'"
            return (quote + val + quote).ljust(col_width)
        # Safe to not quote
        # Numbers must not be quoted
        return val.ljust(col_width)

    def _quote_before_whitespace(self, val, quote):
        """Return True if ``quote`` is directly followed by a space or tab in ``val``."""
        return (quote + " ") in val or (quote + "\t") in val

    def _requires_newline(self, val):
        """Return True if ``val`` must be written as a semicolon text field.

        That is the case when it contains a newline, or when neither quote
        character can enclose it because both occur directly before
        whitespace inside the value.
        """
        if "\n" in val:
            return True
        return self._quote_before_whitespace(
            val, "'"
        ) and self._quote_before_whitespace(val, '"')

    def _requires_quote(self, val):
        """Return True if ``val`` cannot be written as a bare mmCIF value.

        Empty values, values containing whitespace or quote characters, values
        starting with a character that has special meaning and the reserved
        words (compared case-insensitively) all need enclosing.
        """
        # An empty value would otherwise vanish from the output
        if not val:
            return True
        lowered = val.lower()
        return (
            " " in val
            or "\t" in val
            or "'" in val
            or '"' in val
            or val[0] in ("_", "#", "$", "[", "]", ";")
            or lowered.startswith(("data_", "save_"))
            or lowered in ("loop_", "stop_", "global_")
        )

    def _get_label_asym_id(self, entity_id):
        """Convert a positive integer into a chain ID.

        Goes A to Z, then AA to ZA, AB to ZB etc, i.e. the first letter
        changes fastest. This is in line with existing mmCIF files.

        :param entity_id: number of the molecule, starting from 1
        :type entity_id: int
        :returns: the chain ID; an empty string for values below 1
        :rtype: string
        """
        div = entity_id
        out = ""
        while div > 0:
            mod = (div - 1) % 26
            out += chr(65 + mod)
            div = (div - mod) // 26
        return out

    def _save_structure(self, out_file, select, preserve_atom_numbering):
        """Write the selected atoms of ``self.structure`` as ``_atom_site`` records.

        The atoms are collected into a mmCIF dictionary, which replaces
        ``self.dic`` and is written with ``_save_dict``.

        :param out_file: open handle with a ``write`` method
        :param select: object providing accept_model, accept_chain,
            accept_residue and accept_atom
        :param preserve_atom_numbering: if True, write each atom's own serial
            number instead of renumbering from 1 within each model
        :type preserve_atom_numbering: bool
        """
        atom_dict = defaultdict(list)

        for model in self.structure.get_list():
            if not select.accept_model(model):
                continue
            # mmCIF files with a single model have it specified as model 1
            if model.serial_num == 0:
                model_n = "1"
            else:
                model_n = str(model.serial_num)
            # Counts the molecules of the model, starting from 1, and is
            # turned into label_asym_id (label_entity_id is written as "?")
            entity_id = 0
            # Running serial number, restarted for every model; only used when
            # the atoms are renumbered
            atom_number = 1
            for chain in model.get_list():
                if not select.accept_chain(chain):
                    continue
                chain_id = chain.get_id()
                if chain_id == " ":
                    chain_id = "."
                # This is used to write label_seq_id and increments from 1,
                # remaining blank for hetero residues
                residue_number = 1
                prev_residue_type = ""
                prev_resname = ""
                for residue in chain.get_unpacked_list():
                    if not select.accept_residue(residue):
                        continue
                    hetfield, resseq, icode = residue.get_id()
                    if hetfield == " ":
                        residue_type = "ATOM"
                        label_seq_id = str(residue_number)
                        residue_number += 1
                    else:
                        residue_type = "HETATM"
                        label_seq_id = "."
                    resseq = str(resseq)
                    if icode == " ":
                        icode = "?"
                    resname = residue.get_resname()
                    # Check if the molecule changes within the chain
                    # This will always increment for the first residue in a
                    # chain due to the starting values above
                    if residue_type != prev_residue_type or (
                        residue_type == "HETATM" and resname != prev_resname
                    ):
                        entity_id += 1
                    prev_residue_type = residue_type
                    prev_resname = resname
                    label_asym_id = self._get_label_asym_id(entity_id)
                    for atom in residue.get_unpacked_list():
                        if not select.accept_atom(atom):
                            continue
                        if preserve_atom_numbering:
                            atom_id = atom.get_serial_number()
                        else:
                            atom_id = atom_number
                            atom_number += 1
                        element = atom.element.strip()
                        if element == "":
                            element = "?"
                        altloc = atom.get_altloc()
                        if altloc == " ":
                            altloc = "."
                        coord = atom.get_coord()
                        record = {
                            "group_PDB": residue_type,
                            "id": str(atom_id),
                            "type_symbol": element,
                            "label_atom_id": atom.get_name().strip(),
                            "label_alt_id": altloc,
                            "label_comp_id": resname.strip(),
                            "label_asym_id": label_asym_id,
                            # The entity ID should be the same for similar chains
                            # However this is non-trivial to calculate so we write "?"
                            "label_entity_id": "?",
                            "label_seq_id": label_seq_id,
                            "pdbx_PDB_ins_code": icode,
                            "Cartn_x": f"{coord[0]:.3f}",
                            "Cartn_y": f"{coord[1]:.3f}",
                            "Cartn_z": f"{coord[2]:.3f}",
                            "occupancy": str(atom.get_occupancy()),
                            "B_iso_or_equiv": str(atom.get_bfactor()),
                            "auth_seq_id": resseq,
                            "auth_asym_id": chain_id,
                            "pdbx_PDB_model_num": model_n,
                        }
                        for item, value in record.items():
                            atom_dict["_atom_site." + item].append(value)

        # Data block name is the structure ID with special characters removed
        structure_id = self.structure.id
        for c in ["#", "$", "'", '"', "[", "]", " ", "\t", "\n"]:
            structure_id = structure_id.replace(c, "")
        atom_dict["data_"] = structure_id

        # Set the dictionary and write out using the generic dictionary method
        self.dic = atom_dict
        self._save_dict(out_file)
|